Main Page | Class List | Directories | File List | Class Members | File Members

filters_sse.h

Go to the documentation of this file.
00001 /* Copyright (C) 2002 Jean-Marc Valin 
00002    File: filters_sse.h
00003    Various analysis/synthesis filters (SSE version)
00004 
00005    Redistribution and use in source and binary forms, with or without
00006    modification, are permitted provided that the following conditions
00007    are met:
00008    
00009    - Redistributions of source code must retain the above copyright
00010    notice, this list of conditions and the following disclaimer.
00011    
00012    - Redistributions in binary form must reproduce the above copyright
00013    notice, this list of conditions and the following disclaimer in the
00014    documentation and/or other materials provided with the distribution.
00015    
00016    - Neither the name of the Xiph.org Foundation nor the names of its
00017    contributors may be used to endorse or promote products derived from
00018    this software without specific prior written permission.
00019    
00020    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00021    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00022    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00023    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
00024    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00025    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00026    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00027    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00028    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00029    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00030    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00031 */
00032 
00033 #include <xmmintrin.h>
00034 
/* Order-10 pole-zero filter over N samples, computed with SSE.
 *
 * Per sample the code evaluates
 *    y[i]    = x[i] + _mem[0]
 *    _mem[j] = _mem[j+1] + _num[j+1]*x[i] - _den[j+1]*y[i]   (j = 0..8)
 *    _mem[9] =             _num[10]*x[i]  - _den[10]*y[i]
 *
 * _num[0] and _den[0] are never read; _num and _den are read at indices
 * 1..10.  _mem holds 10 floats of filter state, updated in place.
 * ord is not used by this routine.
 */
void filter_mem2_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   /* Taps 1-4, 5-8 and 9-10 live in three registers; the two unused
      upper lanes of the last register are loaded with zero. */
   const __m128 b_lo  = _mm_loadu_ps(_num+1);
   const __m128 b_mid = _mm_loadu_ps(_num+5);
   const __m128 b_hi  = _mm_setr_ps(_num[9], _num[10], 0, 0);
   const __m128 a_lo  = _mm_loadu_ps(_den+1);
   const __m128 a_mid = _mm_loadu_ps(_den+5);
   const __m128 a_hi  = _mm_setr_ps(_den[9], _den[10], 0, 0);
   __m128 state_lo  = _mm_loadu_ps(_mem);
   __m128 state_mid = _mm_loadu_ps(_mem+4);
   __m128 state_hi  = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   int n = 0;

   while (n < N)
   {
      /* Input sample broadcast to all lanes */
      const __m128 in = _mm_load_ps1(x+n);
      /* Output sample: input plus first state element (lane 0 only) */
      __m128 out = _mm_add_ss(in, state_lo);
      _mm_store_ss(y+n, out);
      out = _mm_shuffle_ps(out, out, 0);

      /* Bring the first element of the next register into lane 0, then
         rotate down one lane (0x39) so it ends up in lane 3. */
      state_lo = _mm_move_ss(state_lo, state_mid);
      state_lo = _mm_shuffle_ps(state_lo, state_lo, 0x39);
      state_lo = _mm_add_ps(state_lo, _mm_mul_ps(in, b_lo));
      state_lo = _mm_sub_ps(state_lo, _mm_mul_ps(out, a_lo));

      state_mid = _mm_move_ss(state_mid, state_hi);
      state_mid = _mm_shuffle_ps(state_mid, state_mid, 0x39);
      state_mid = _mm_add_ps(state_mid, _mm_mul_ps(in, b_mid));
      state_mid = _mm_sub_ps(state_mid, _mm_mul_ps(out, a_mid));

      /* 0xfd selects lanes (1,3,3,3): old element 9 moves to lane 0 and
         lane 3 (loaded as zero, zero coefficients) fills the rest. */
      state_hi = _mm_shuffle_ps(state_hi, state_hi, 0xfd);
      state_hi = _mm_add_ps(state_hi, _mm_mul_ps(in, b_hi));
      state_hi = _mm_sub_ps(state_hi, _mm_mul_ps(out, a_hi));

      n++;
   }

   /* Write the ten state values back; only lanes 0 and 1 of the last
      register carry state. */
   _mm_storeu_ps(_mem, state_lo);
   _mm_storeu_ps(_mem+4, state_mid);
   _mm_store_ss(_mem+8, state_hi);
   state_hi = _mm_shuffle_ps(state_hi, state_hi, 0x55);
   _mm_store_ss(_mem+9, state_hi);
}
00087 
/* Order-8 pole-zero filter over N samples, computed with SSE.
 *
 * Per sample the code evaluates
 *    y[i]    = x[i] + _mem[0]
 *    _mem[j] = _mem[j+1] + _num[j+1]*x[i] - _den[j+1]*y[i]   (j = 0..6)
 *    _mem[7] =             _num[8]*x[i]   - _den[8]*y[i]
 *
 * _num[0] and _den[0] are never read; _num and _den are read at indices
 * 1..8.  _mem holds 8 floats of filter state, updated in place.
 * ord is not used by this routine.
 */
void filter_mem2_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   /* Taps 1-4 and 5-8, and the matching halves of the state */
   const __m128 b_lo = _mm_loadu_ps(_num+1);
   const __m128 b_hi = _mm_loadu_ps(_num+5);
   const __m128 a_lo = _mm_loadu_ps(_den+1);
   const __m128 a_hi = _mm_loadu_ps(_den+5);
   __m128 state_lo = _mm_loadu_ps(_mem);
   __m128 state_hi = _mm_loadu_ps(_mem+4);
   int n = 0;

   while (n < N)
   {
      /* Input sample broadcast to all lanes */
      const __m128 in = _mm_load_ps1(x+n);
      /* Output sample: input plus first state element (lane 0 only) */
      __m128 out = _mm_add_ss(in, state_lo);
      _mm_store_ss(y+n, out);
      out = _mm_shuffle_ps(out, out, 0);

      /* Bring the first element of the upper half into lane 0, then
         rotate down one lane (0x39) so it ends up in lane 3. */
      state_lo = _mm_move_ss(state_lo, state_hi);
      state_lo = _mm_shuffle_ps(state_lo, state_lo, 0x39);
      state_lo = _mm_add_ps(state_lo, _mm_mul_ps(in, b_lo));
      state_lo = _mm_sub_ps(state_lo, _mm_mul_ps(out, a_lo));

      /* Lane 0 is subtracted from itself before the rotation so that the
         lane entering the top of the delay line starts from zero. */
      state_hi = _mm_sub_ss(state_hi, state_hi);
      state_hi = _mm_shuffle_ps(state_hi, state_hi, 0x39);
      state_hi = _mm_add_ps(state_hi, _mm_mul_ps(in, b_hi));
      state_hi = _mm_sub_ps(state_hi, _mm_mul_ps(out, a_hi));

      n++;
   }

   /* Write the eight state values back */
   _mm_storeu_ps(_mem, state_lo);
   _mm_storeu_ps(_mem+4, state_hi);
}
00129 
00130 
00131 
/* Pole-zero filter of order ord over N samples.
 *
 * Orders 10 and 8 are handled by the SSE kernels.  Any other order used to
 * fall through without writing y or updating _mem; it is now handled by a
 * scalar loop implementing the same recurrence as the SSE kernels:
 *    y[i]        = x[i] + _mem[0]
 *    _mem[j]     = _mem[j+1] + _num[j+1]*x[i] - _den[j+1]*y[i]  (j < ord-1)
 *    _mem[ord-1] =             _num[ord]*x[i] - _den[ord]*y[i]
 *
 * _num and _den are read at indices 1..ord; _mem holds ord floats and is
 * updated in place.  An order below 1 has no state, so x is copied to y.
 */
void filter_mem2(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   if(ord==10)
      filter_mem2_10(x, _num, _den, y, N, ord, _mem);
   else if (ord==8)
      filter_mem2_8(x, _num, _den, y, N, ord, _mem);
   else
   {
      int i, j;
      for (i=0;i<N;i++)
      {
         /* Read the input before writing y[i] so x and y may be the same buffer */
         const float xi = x[i];
         const float yi = (ord>0) ? xi + _mem[0] : xi;
         for (j=0;j<ord-1;j++)
            _mem[j] = _mem[j+1] + _num[j+1]*xi - _den[j+1]*yi;
         if (ord>0)
            _mem[ord-1] = _num[ord]*xi - _den[ord]*yi;
         y[i] = yi;
      }
   }
}
00139 
00140 
00141 
/* Order-10 all-pole filter over N samples, computed with SSE.
 *
 * Per sample the code evaluates
 *    y[i]    = x[i] + _mem[0]
 *    _mem[j] = _mem[j+1] - _den[j+1]*y[i]   (j = 0..8)
 *    _mem[9] =           - _den[10]*y[i]
 *
 * _den[0] is never read; _den is read at indices 1..10.  _mem holds 10
 * floats of filter state, updated in place.  ord is not used here.
 */
void iir_mem2_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   /* Denominator taps 1-4, 5-8 and 9-10 and the matching state; the two
      unused upper lanes of the last registers are loaded with zero. */
   const __m128 a_lo  = _mm_loadu_ps(_den+1);
   const __m128 a_mid = _mm_loadu_ps(_den+5);
   const __m128 a_hi  = _mm_setr_ps(_den[9], _den[10], 0, 0);
   __m128 state_lo  = _mm_loadu_ps(_mem);
   __m128 state_mid = _mm_loadu_ps(_mem+4);
   __m128 state_hi  = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   int n = 0;

   while (n < N)
   {
      /* Input sample broadcast to all lanes */
      const __m128 in = _mm_load_ps1(x+n);
      /* Output sample: input plus first state element (lane 0 only) */
      __m128 out = _mm_add_ss(in, state_lo);
      _mm_store_ss(y+n, out);
      out = _mm_shuffle_ps(out, out, 0);

      /* Bring the first element of the next register into lane 0, then
         rotate down one lane (0x39) so it ends up in lane 3. */
      state_lo = _mm_move_ss(state_lo, state_mid);
      state_lo = _mm_shuffle_ps(state_lo, state_lo, 0x39);
      state_lo = _mm_sub_ps(state_lo, _mm_mul_ps(out, a_lo));

      state_mid = _mm_move_ss(state_mid, state_hi);
      state_mid = _mm_shuffle_ps(state_mid, state_mid, 0x39);
      state_mid = _mm_sub_ps(state_mid, _mm_mul_ps(out, a_mid));

      /* 0xfd selects lanes (1,3,3,3): old element 9 moves to lane 0 and
         lane 3 (loaded as zero, zero coefficients) fills the rest. */
      state_hi = _mm_shuffle_ps(state_hi, state_hi, 0xfd);
      state_hi = _mm_sub_ps(state_hi, _mm_mul_ps(out, a_hi));

      n++;
   }

   /* Write the ten state values back; only lanes 0 and 1 of the last
      register carry state. */
   _mm_storeu_ps(_mem, state_lo);
   _mm_storeu_ps(_mem+4, state_mid);
   _mm_store_ss(_mem+8, state_hi);
   state_hi = _mm_shuffle_ps(state_hi, state_hi, 0x55);
   _mm_store_ss(_mem+9, state_hi);
}
00189 
00190 
/* Order-8 all-pole filter over N samples, computed with SSE.
 *
 * Per sample the code evaluates
 *    y[i]    = x[i] + _mem[0]
 *    _mem[j] = _mem[j+1] - _den[j+1]*y[i]   (j = 0..6)
 *    _mem[7] =           - _den[8]*y[i]
 *
 * _den[0] is never read; _den is read at indices 1..8.  _mem holds 8
 * floats of filter state, updated in place.  ord is not used here.
 */
void iir_mem2_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   /* Denominator taps 1-4 and 5-8, and the matching halves of the state */
   const __m128 a_lo = _mm_loadu_ps(_den+1);
   const __m128 a_hi = _mm_loadu_ps(_den+5);
   __m128 state_lo = _mm_loadu_ps(_mem);
   __m128 state_hi = _mm_loadu_ps(_mem+4);
   int n = 0;

   while (n < N)
   {
      /* Input sample broadcast to all lanes */
      const __m128 in = _mm_load_ps1(x+n);
      /* Output sample: input plus first state element (lane 0 only) */
      __m128 out = _mm_add_ss(in, state_lo);
      _mm_store_ss(y+n, out);
      out = _mm_shuffle_ps(out, out, 0);

      /* Bring the first element of the upper half into lane 0, then
         rotate down one lane (0x39) so it ends up in lane 3. */
      state_lo = _mm_move_ss(state_lo, state_hi);
      state_lo = _mm_shuffle_ps(state_lo, state_lo, 0x39);
      state_lo = _mm_sub_ps(state_lo, _mm_mul_ps(out, a_lo));

      /* Lane 0 is subtracted from itself before the rotation so that the
         lane entering the top of the delay line starts from zero. */
      state_hi = _mm_sub_ss(state_hi, state_hi);
      state_hi = _mm_shuffle_ps(state_hi, state_hi, 0x39);
      state_hi = _mm_sub_ps(state_hi, _mm_mul_ps(out, a_hi));

      n++;
   }

   /* Write the eight state values back */
   _mm_storeu_ps(_mem, state_lo);
   _mm_storeu_ps(_mem+4, state_hi);
}
00229 
/* All-pole filter of order ord over N samples.
 *
 * Orders 10 and 8 are handled by the SSE kernels.  Any other order used to
 * fall through without writing y or updating _mem; it is now handled by a
 * scalar loop implementing the same recurrence as the SSE kernels:
 *    y[i]        = x[i] + _mem[0]
 *    _mem[j]     = _mem[j+1] - _den[j+1]*y[i]   (j < ord-1)
 *    _mem[ord-1] =           - _den[ord]*y[i]
 *
 * _den is read at indices 1..ord; _mem holds ord floats and is updated in
 * place.  An order below 1 has no state, so x is copied to y.
 */
void iir_mem2(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   if(ord==10)
      iir_mem2_10(x, _den, y, N, ord, _mem);
   else if (ord==8)
      iir_mem2_8(x, _den, y, N, ord, _mem);
   else
   {
      int i, j;
      for (i=0;i<N;i++)
      {
         /* Read the input before writing y[i] so x and y may be the same buffer */
         const float xi = x[i];
         const float yi = (ord>0) ? xi + _mem[0] : xi;
         for (j=0;j<ord-1;j++)
            _mem[j] = _mem[j+1] - _den[j+1]*yi;
         if (ord>0)
            _mem[ord-1] = 0.0f - _den[ord]*yi;
         y[i] = yi;
      }
   }
}
00237 
00238 
/* Order-10 all-zero (FIR) filter over N samples, computed with SSE.
 *
 * Per sample the code evaluates
 *    y[i]    = x[i] + _mem[0]
 *    _mem[j] = _mem[j+1] + _num[j+1]*x[i]   (j = 0..8)
 *    _mem[9] =             _num[10]*x[i]
 *
 * _num[0] is never read; _num is read at indices 1..10.  _mem holds 10
 * floats of filter state, updated in place.  ord is not used here.
 */
void fir_mem2_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], mem[3];
   const __m128 zero = _mm_setzero_ps();

   int i;

   (void)ord;

   /* Copy numerator and memory to xmm registers; the two unused upper
      lanes of the third register are loaded with zero. */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i+1);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[9], _num[10], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result (lane 0 only) */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);

      /* Update memory: pull the first element of the next register into
         lane 0, rotate down one lane (0x39), then add the input term. */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));

      /* Move old element 9 to lane 0 and force the other lanes to an
         explicit zero.  Recycling an unused lane as "zero" breaks once a
         non-finite input has been multiplied into it (Inf*0 and NaN*0 are
         NaN), which would keep corrupting the state after the bad sample
         has left the 10-tap window. */
      mem[2] = _mm_move_ss(zero, _mm_shuffle_ps(mem[2], mem[2], 0x55));

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
   }
   /* Put memory back in its place; only lanes 0 and 1 of the third
      register carry state. */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}
00286 
/* Order-8 all-zero (FIR) filter over N samples, computed with SSE.
 *
 * Per sample the code evaluates
 *    y[i]    = x[i] + _mem[0]
 *    _mem[j] = _mem[j+1] + _num[j+1]*x[i]   (j = 0..6)
 *    _mem[7] =             _num[8]*x[i]
 *
 * _num[0] is never read; _num is read at indices 1..8.  _mem holds 8
 * floats of filter state, updated in place.  ord is not used here.
 */
void fir_mem2_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], mem[2];
   const __m128 zero = _mm_setzero_ps();

   int i;

   (void)ord;

   /* Copy numerator and memory to xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i+1);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result (lane 0 only) */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);

      /* Update memory: pull the first element of the upper half into
         lane 0, rotate down one lane (0x39), then add the input term. */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      /* Clear lane 0 with an explicit zero before rotating it to the top
         of the delay line.  Subtracting the lane from itself yields NaN
         when the lane is Inf or NaN, and that NaN would then circulate in
         the upper half of the state indefinitely. */
      mem[1] = _mm_move_ss(mem[1], zero);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}
00325 
00326 
/* All-zero (FIR) filter of order ord over N samples.
 *
 * Orders 10 and 8 are handled by the SSE kernels.  Any other order used to
 * fall through without writing y or updating _mem; it is now handled by a
 * scalar loop implementing the same recurrence as the SSE kernels:
 *    y[i]        = x[i] + _mem[0]
 *    _mem[j]     = _mem[j+1] + _num[j+1]*x[i]   (j < ord-1)
 *    _mem[ord-1] =             _num[ord]*x[i]
 *
 * _num is read at indices 1..ord; _mem holds ord floats and is updated in
 * place.  An order below 1 has no state, so x is copied to y.
 */
void fir_mem2(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   if(ord==10)
      fir_mem2_10(x, _num, y, N, ord, _mem);
   else if (ord==8)
      fir_mem2_8(x, _num, y, N, ord, _mem);
   else
   {
      int i, j;
      for (i=0;i<N;i++)
      {
         /* Read the input before writing y[i] so x and y may be the same buffer */
         const float xi = x[i];
         const float yi = (ord>0) ? xi + _mem[0] : xi;
         for (j=0;j<ord-1;j++)
            _mem[j] = _mem[j+1] + _num[j+1]*xi;
         if (ord>0)
            _mem[ord-1] = _num[ord]*xi;
         y[i] = yi;
      }
   }
}

Generated on Tue May 17 12:46:54 2005 for speex by  doxygen 1.4.2