00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include <xmmintrin.h>
00034
/**
 * Inner (dot) product of two float vectors, computed with SSE.
 *
 * @param a    First input vector; at least len floats are read. No alignment
 *             is required (unaligned loads are used).
 * @param b    Second input vector; same requirements as a.
 * @param len  Number of elements to multiply and accumulate. A value <= 0
 *             yields 0.
 * @return     Sum of a[k]*b[k] for k in [0, len).
 *
 * Elements are accumulated in four parallel lanes that are added together at
 * the end, so rounding may differ from a strictly sequential summation.
 */
static float inner_prod(const float *a, const float *b, int len)
{
   int i = 0;
   float ret;
   __m128 sum = _mm_setzero_ps();

   /* Main loop: two 4-float vectors (8 floats) per iteration.  The bound is
      expressed in floats so that a trailing half block is never read here. */
   while (len - i >= 8)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
      i += 8;
   }

   /* At most one complete 4-float vector can remain. */
   if (len - i >= 4)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      i += 4;
   }

   /* Horizontal reduction: fold the upper two lanes onto the lower two, then
      add lane 1 into lane 0. */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&ret, sum);

   /* Scalar tail for the last (len & 3) elements. */
   for (; i < len; i++)
      ret += a[i]*b[i];

   return ret;
}
00052
00053 static void pitch_xcorr(const float *_x, const float *_y, float *corr, int len, int nb_pitch, char *stack)
00054 {
00055 int i, offset;
00056 __m128 *x, *y;
00057 int N, L;
00058 N = len>>2;
00059 L = nb_pitch>>2;
00060 x = PUSH(stack, N, __m128);
00061 y = PUSH(stack, N+L, __m128);
00062 for (i=0;i<N;i++)
00063 x[i] = _mm_loadu_ps(_x+(i<<2));
00064 for (offset=0;offset<4;offset++)
00065 {
00066 for (i=0;i<N+L;i++)
00067 y[i] = _mm_loadu_ps(_y+(i<<2)+offset);
00068 for (i=0;i<L;i++)
00069 {
00070 int j;
00071 __m128 sum, *xx, *yy;
00072 sum = _mm_setzero_ps();
00073 yy = y+i;
00074 xx = x;
00075 for (j=0;j<N;j+=2)
00076 {
00077 sum = _mm_add_ps(sum, _mm_mul_ps(xx[0], yy[0]));
00078 sum = _mm_add_ps(sum, _mm_mul_ps(xx[1], yy[1]));
00079 xx += 2;
00080 yy += 2;
00081 }
00082 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
00083 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
00084 _mm_store_ss(corr+nb_pitch-1-(i<<2)-offset, sum);
00085 }
00086 }
00087 }