00001
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include <xmmintrin.h>
00036
#define OVERRIDE_INNER_PROD
/* SSE inner product of two float arrays.
 *
 * a, b : input arrays, each with at least len readable floats; loaded with
 *        unaligned loads, so no alignment is required.
 * len  : number of samples to multiply and accumulate; any value is
 *        accepted, and len <= 0 yields 0.
 *
 * Returns the sum of a[k]*b[k] for k in [0, len).
 *
 * The bulk is accumulated in four independent SSE lanes, eight samples per
 * iteration, so the summation order differs from a plain scalar loop.  A
 * leftover group of four samples is handled with one more vector step and
 * the final len & 3 samples are added in scalar code, so the function never
 * reads beyond a[len-1] / b[len-1].
 */
static float inner_prod(const float *a, const float *b, int len)
{
   int i;
   float ret;
   __m128 sum = _mm_setzero_ps();

   /* Main loop: two 4-float vectors per iteration.  "len-i >= 8" is used
      instead of "i+8 <= len" so the test cannot overflow. */
   for (i=0;len-i>=8;i+=8)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
   }
   /* At most one complete 4-float vector can remain. */
   if (len-i>=4)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      i += 4;
   }

   /* Horizontal sum: fold the upper two lanes onto the lower two, then add
      lane 1 (broadcast by shuffle mask 0x55) onto lane 0. */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&ret, sum);

   /* Scalar tail for the last len & 3 samples. */
   for (;i<len;i++)
      ret += a[i]*b[i];
   return ret;
}
00055
00056 #define OVERRIDE_PITCH_XCORR
00057 static void pitch_xcorr(const float *_x, const float *_y, float *corr, int len, int nb_pitch, char *stack)
00058 {
00059 int i, offset;
00060 VARDECL(__m128 *x);
00061 VARDECL(__m128 *y);
00062 int N, L;
00063 N = len>>2;
00064 L = nb_pitch>>2;
00065 ALLOC(x, N, __m128);
00066 ALLOC(y, N+L, __m128);
00067 for (i=0;i<N;i++)
00068 x[i] = _mm_loadu_ps(_x+(i<<2));
00069 for (offset=0;offset<4;offset++)
00070 {
00071 for (i=0;i<N+L;i++)
00072 y[i] = _mm_loadu_ps(_y+(i<<2)+offset);
00073 for (i=0;i<L;i++)
00074 {
00075 int j;
00076 __m128 sum, *xx, *yy;
00077 sum = _mm_setzero_ps();
00078 yy = y+i;
00079 xx = x;
00080 for (j=0;j<N;j+=2)
00081 {
00082 sum = _mm_add_ps(sum, _mm_mul_ps(xx[0], yy[0]));
00083 sum = _mm_add_ps(sum, _mm_mul_ps(xx[1], yy[1]));
00084 xx += 2;
00085 yy += 2;
00086 }
00087 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
00088 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
00089 _mm_store_ss(corr+nb_pitch-1-(i<<2)-offset, sum);
00090 }
00091 }
00092 }