00001
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include <xmmintrin.h>
00036
/*
 * Order-10 pole-zero filter, SSE implementation.
 *
 * For each of the N samples:
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i] - den[j]*y[i]     (j = 0..8)
 *    mem[9] =            num[9]*x[i] - den[9]*y[i]
 *
 * x     input samples (N floats)
 * _num  10 numerator coefficients
 * _den  10 denominator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   not used here, the order is fixed at 10
 * _mem  10 floats of filter state, read on entry and written back on exit
 */
void filter_mem2_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   /* Coefficients split as taps 0-3, 4-7 and 8-9 (upper two lanes are zero) */
   const __m128 b03 = _mm_loadu_ps(_num);
   const __m128 b47 = _mm_loadu_ps(_num+4);
   const __m128 b89 = _mm_setr_ps(_num[8], _num[9], 0, 0);
   const __m128 a03 = _mm_loadu_ps(_den);
   const __m128 a47 = _mm_loadu_ps(_den+4);
   const __m128 a89 = _mm_setr_ps(_den[8], _den[9], 0, 0);
   /* Filter state, same lane layout as the coefficients */
   __m128 s03 = _mm_loadu_ps(_mem);
   __m128 s47 = _mm_loadu_ps(_mem+4);
   __m128 s89 = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   int n;

   (void)ord;

   for (n=0;n<N;n++)
   {
      /* Input sample broadcast to all four lanes */
      const __m128 in = _mm_set1_ps(x[n]);
      /* Lane 0 holds the output sample x[n] + mem[0] */
      __m128 out = _mm_add_ss(in, s03);

      _mm_store_ss(y+n, out);
      out = _mm_shuffle_ps(out, out, _MM_SHUFFLE(0,0,0,0));

      /* States 0-3: pull in state 4, rotate down one lane, then update */
      s03 = _mm_move_ss(s03, s47);
      s03 = _mm_shuffle_ps(s03, s03, _MM_SHUFFLE(0,3,2,1));
      s03 = _mm_sub_ps(_mm_add_ps(s03, _mm_mul_ps(in, b03)), _mm_mul_ps(out, a03));

      /* States 4-7: pull in state 8, rotate down one lane, then update */
      s47 = _mm_move_ss(s47, s89);
      s47 = _mm_shuffle_ps(s47, s47, _MM_SHUFFLE(0,3,2,1));
      s47 = _mm_sub_ps(_mm_add_ps(s47, _mm_mul_ps(in, b47)), _mm_mul_ps(out, a47));

      /* States 8-9: state 9 moves to lane 0, the other lanes take lane 3 */
      s89 = _mm_shuffle_ps(s89, s89, _MM_SHUFFLE(3,3,3,1));
      s89 = _mm_sub_ps(_mm_add_ps(s89, _mm_mul_ps(in, b89)), _mm_mul_ps(out, a89));
   }

   /* Write the ten state values back */
   _mm_storeu_ps(_mem, s03);
   _mm_storeu_ps(_mem+4, s47);
   _mm_store_ss(_mem+8, s89);
   _mm_store_ss(_mem+9, _mm_shuffle_ps(s89, s89, _MM_SHUFFLE(1,1,1,1)));
}
00089
/*
 * Order-8 pole-zero filter, SSE implementation.
 *
 * For each of the N samples:
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i] - den[j]*y[i]     (j = 0..6)
 *    mem[7] =            num[7]*x[i] - den[7]*y[i]
 *
 * x     input samples (N floats)
 * _num  8 numerator coefficients
 * _den  8 denominator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   not used here, the order is fixed at 8
 * _mem  8 floats of filter state, read on entry and written back on exit
 */
void filter_mem2_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], den[2], mem[2];
   const __m128 zero = _mm_setzero_ps();
   int i;

   (void)ord;

   /* Copy numerator, denominator and state into SSE registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute the next output sample and broadcast it to all lanes */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* States 0-3: pull in state 4, rotate down one lane, then update */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      /* States 4-7: shift in an exact zero for the newest state.  Clearing
         the lane with (m - m) would give NaN whenever m is NaN or infinite. */
      mem[1] = _mm_move_ss(mem[1], zero);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }

   /* Write the eight state values back */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}
00131
00132
#define OVERRIDE_FILTER_MEM2
/*
 * Pole-zero filter entry point.
 *
 * Orders 10 and 8 are dispatched to the SSE kernels.  Every other order is
 * handled by a scalar loop that applies the same recurrence, so y is always
 * written instead of being silently left untouched:
 *    y[i]       = x[i] + mem[0]
 *    mem[j]     = mem[j+1] + num[j]*x[i] - den[j]*y[i]   (j = 0..ord-2)
 *    mem[ord-1] =            num[ord-1]*x[i] - den[ord-1]*y[i]
 * With ord <= 0 there is no state and x is copied to y.
 *
 * x     input samples (N floats)
 * _num  ord numerator coefficients
 * _den  ord denominator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   filter order
 * _mem  ord floats of filter state, updated in place
 */
void filter_mem2(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   int i, j;

   if (ord==10)
   {
      filter_mem2_10(x, _num, _den, y, N, ord, _mem);
      return;
   }
   if (ord==8)
   {
      filter_mem2_8(x, _num, _den, y, N, ord, _mem);
      return;
   }

   /* Scalar fallback for orders without a dedicated SSE kernel */
   for (i=0;i<N;i++)
   {
      const float xi = x[i];
      const float yi = (ord > 0) ? xi + _mem[0] : xi;

      y[i] = yi;
      for (j=0;j+1<ord;j++)
      {
         _mem[j] = _mem[j+1] + _num[j]*xi - _den[j]*yi;
      }
      if (ord > 0)
      {
         _mem[ord-1] = _num[ord-1]*xi - _den[ord-1]*yi;
      }
   }
}
00141
00142
00143
/*
 * Order-10 all-pole filter, SSE implementation.
 *
 * For each of the N samples:
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] - den[j]*y[i]     (j = 0..8)
 *    mem[9] =          - den[9]*y[i]
 *
 * x     input samples (N floats)
 * _den  10 denominator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   not used here, the order is fixed at 10
 * _mem  10 floats of filter state, read on entry and written back on exit
 */
void iir_mem2_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   /* Coefficients split as taps 0-3, 4-7 and 8-9 (upper two lanes are zero) */
   const __m128 a03 = _mm_loadu_ps(_den);
   const __m128 a47 = _mm_loadu_ps(_den+4);
   const __m128 a89 = _mm_setr_ps(_den[8], _den[9], 0, 0);
   /* Filter state, same lane layout as the coefficients */
   __m128 s03 = _mm_loadu_ps(_mem);
   __m128 s47 = _mm_loadu_ps(_mem+4);
   __m128 s89 = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   int n;

   (void)ord;

   for (n=0;n<N;n++)
   {
      /* Lane 0 of out is the output sample x[n] + mem[0] */
      __m128 out = _mm_add_ss(_mm_set1_ps(x[n]), s03);

      _mm_store_ss(y+n, out);
      out = _mm_shuffle_ps(out, out, _MM_SHUFFLE(0,0,0,0));

      /* States 0-3: pull in state 4, rotate down one lane, apply feedback */
      s03 = _mm_move_ss(s03, s47);
      s03 = _mm_shuffle_ps(s03, s03, _MM_SHUFFLE(0,3,2,1));
      s03 = _mm_sub_ps(s03, _mm_mul_ps(out, a03));

      /* States 4-7: pull in state 8, rotate down one lane, apply feedback */
      s47 = _mm_move_ss(s47, s89);
      s47 = _mm_shuffle_ps(s47, s47, _MM_SHUFFLE(0,3,2,1));
      s47 = _mm_sub_ps(s47, _mm_mul_ps(out, a47));

      /* States 8-9: state 9 moves to lane 0, the other lanes take lane 3 */
      s89 = _mm_shuffle_ps(s89, s89, _MM_SHUFFLE(3,3,3,1));
      s89 = _mm_sub_ps(s89, _mm_mul_ps(out, a89));
   }

   /* Write the ten state values back */
   _mm_storeu_ps(_mem, s03);
   _mm_storeu_ps(_mem+4, s47);
   _mm_store_ss(_mem+8, s89);
   _mm_store_ss(_mem+9, _mm_shuffle_ps(s89, s89, _MM_SHUFFLE(1,1,1,1)));
}
00191
00192
/*
 * Order-8 all-pole filter, SSE implementation.
 *
 * For each of the N samples:
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] - den[j]*y[i]     (j = 0..6)
 *    mem[7] =          - den[7]*y[i]
 *
 * x     input samples (N floats)
 * _den  8 denominator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   not used here, the order is fixed at 8
 * _mem  8 floats of filter state, read on entry and written back on exit
 */
void iir_mem2_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[2], mem[2];
   const __m128 zero = _mm_setzero_ps();
   int i;

   (void)ord;

   /* Copy denominator and state into SSE registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute the next output sample and broadcast it to all lanes */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* States 0-3: pull in state 4, rotate down one lane, apply feedback */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      /* States 4-7: shift in an exact zero for the newest state.  Clearing
         the lane with (m - m) would give NaN whenever m is NaN or infinite. */
      mem[1] = _mm_move_ss(mem[1], zero);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }

   /* Write the eight state values back */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}
00231
#define OVERRIDE_IIR_MEM2
/*
 * All-pole filter entry point.
 *
 * Orders 10 and 8 are dispatched to the SSE kernels.  Every other order is
 * handled by a scalar loop that applies the same recurrence, so y is always
 * written instead of being silently left untouched:
 *    y[i]       = x[i] + mem[0]
 *    mem[j]     = mem[j+1] - den[j]*y[i]   (j = 0..ord-2)
 *    mem[ord-1] =          - den[ord-1]*y[i]
 * With ord <= 0 there is no state and x is copied to y.
 *
 * x     input samples (N floats)
 * _den  ord denominator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   filter order
 * _mem  ord floats of filter state, updated in place
 */
void iir_mem2(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   int i, j;

   if (ord==10)
   {
      iir_mem2_10(x, _den, y, N, ord, _mem);
      return;
   }
   if (ord==8)
   {
      iir_mem2_8(x, _den, y, N, ord, _mem);
      return;
   }

   /* Scalar fallback for orders without a dedicated SSE kernel */
   for (i=0;i<N;i++)
   {
      const float xi = x[i];
      const float yi = (ord > 0) ? xi + _mem[0] : xi;

      y[i] = yi;
      for (j=0;j+1<ord;j++)
      {
         _mem[j] = _mem[j+1] - _den[j]*yi;
      }
      if (ord > 0)
      {
         _mem[ord-1] = 0.0f - _den[ord-1]*yi;
      }
   }
}
00240
00241
/*
 * Order-10 all-zero (FIR) filter, SSE implementation.
 *
 * For each of the N samples:
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i]     (j = 0..8)
 *    mem[9] =            num[9]*x[i]
 *
 * The newest state is built from an exact zero, so a NaN or infinite input
 * sample leaves the state again after ten further samples.
 *
 * x     input samples (N floats)
 * _num  10 numerator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   not used here, the order is fixed at 10
 * _mem  10 floats of filter state, read on entry and written back on exit
 */
void fir_mem2_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], mem[3];
   const __m128 zero = _mm_setzero_ps();
   int i;

   (void)ord;

   /* Copy numerator and state into SSE registers; the last register only
      uses its two low lanes */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute and store the next output sample */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);

      /* States 0-3: pull in state 4, rotate down one lane, then update */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      /* States 4-7: pull in state 8, rotate down one lane, then update */
      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));

      /* States 8-9: state 9 moves to lane 0 and every other lane becomes an
         exact zero.  Lanes 2-3 cannot be reused as the zero source because
         x*0 turns them into NaN when x is not finite. */
      mem[2] = _mm_move_ss(zero, _mm_shuffle_ps(mem[2], mem[2], 0x55));

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
   }

   /* Write the ten state values back */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}
00289
/*
 * Order-8 all-zero (FIR) filter, SSE implementation.
 *
 * For each of the N samples:
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i]     (j = 0..6)
 *    mem[7] =            num[7]*x[i]
 *
 * The newest state is built from an exact zero, so a NaN or infinite input
 * sample leaves the state again after eight further samples.
 *
 * x     input samples (N floats)
 * _num  8 numerator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   not used here, the order is fixed at 8
 * _mem  8 floats of filter state, read on entry and written back on exit
 */
void fir_mem2_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], mem[2];
   const __m128 zero = _mm_setzero_ps();
   int i;

   (void)ord;

   /* Copy numerator and state into SSE registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute and store the next output sample */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);

      /* States 0-3: pull in state 4, rotate down one lane, then update */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      /* States 4-7: shift in an exact zero for the newest state.  Clearing
         the lane with (m - m) would give NaN whenever m is NaN or infinite,
         and that NaN would then circulate through the state forever. */
      mem[1] = _mm_move_ss(mem[1], zero);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
   }

   /* Write the eight state values back */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}
00328
#define OVERRIDE_FIR_MEM2
/*
 * All-zero (FIR) filter entry point.
 *
 * Orders 10 and 8 are dispatched to the SSE kernels.  Every other order is
 * handled by a scalar loop that applies the same recurrence, so y is always
 * written instead of being silently left untouched:
 *    y[i]       = x[i] + mem[0]
 *    mem[j]     = mem[j+1] + num[j]*x[i]   (j = 0..ord-2)
 *    mem[ord-1] =            num[ord-1]*x[i]
 * With ord <= 0 there is no state and x is copied to y.
 *
 * x     input samples (N floats)
 * _num  ord numerator coefficients
 * y     output samples (N floats); x[i] is read before y[i] is written
 * N     number of samples to process
 * ord   filter order
 * _mem  ord floats of filter state, updated in place
 */
void fir_mem2(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   int i, j;

   if (ord==10)
   {
      fir_mem2_10(x, _num, y, N, ord, _mem);
      return;
   }
   if (ord==8)
   {
      fir_mem2_8(x, _num, y, N, ord, _mem);
      return;
   }

   /* Scalar fallback for orders without a dedicated SSE kernel */
   for (i=0;i<N;i++)
   {
      const float xi = x[i];

      y[i] = (ord > 0) ? xi + _mem[0] : xi;
      for (j=0;j+1<ord;j++)
      {
         _mem[j] = _mem[j+1] + _num[j]*xi;
      }
      if (ord > 0)
      {
         _mem[ord-1] = _num[ord-1]*xi;
      }
   }
}