Libav 0.7.1
|
00001 /* 00002 * FFT/MDCT transform with Extended 3DNow! optimizations 00003 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt 00004 * 00005 * This file is part of Libav. 00006 * 00007 * Libav is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * Libav is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with Libav; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00022 #include "libavutil/x86_cpu.h" 00023 #include "libavcodec/dsputil.h" 00024 #include "fft.h" 00025 00026 DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 }; 00027 00028 #ifdef EMULATE_3DNOWEXT 00029 #define PSWAPD(s,d)\ 00030 "movq "#s","#d"\n"\ 00031 "psrlq $32,"#d"\n"\ 00032 "punpckldq "#s","#d"\n" 00033 #define ff_fft_calc_3dn2 ff_fft_calc_3dn 00034 #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn 00035 #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn 00036 #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn 00037 #define ff_imdct_half_3dn2 ff_imdct_half_3dn 00038 #else 00039 #define PSWAPD(s,d) "pswapd "#s","#d"\n" 00040 #endif 00041 00042 void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); 00043 void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); 00044 00045 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) 00046 { 00047 int n = 1<<s->nbits; 00048 int i; 00049 ff_fft_dispatch_interleave_3dn2(z, s->nbits); 00050 __asm__ volatile("femms"); 00051 if(n <= 8) 00052 for(i=0; i<n; i+=2) 00053 FFSWAP(FFTSample, z[i].im, z[i+1].re); 00054 } 00055 00056 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) 00057 { 00058 x86_reg j, k; 00059 long n = s->mdct_size; 00060 long n2 = n >> 1; 00061 long n4 = n >> 2; 00062 long n8 = n >> 3; 00063 const uint16_t *revtab = s->revtab; 00064 const FFTSample *tcos = s->tcos; 00065 const FFTSample *tsin = s->tsin; 00066 const FFTSample *in1, *in2; 00067 FFTComplex *z = (FFTComplex *)output; 00068 00069 /* pre rotation */ 00070 in1 = input; 00071 in2 = input + n2 - 1; 00072 #ifdef EMULATE_3DNOWEXT 00073 __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31)); 00074 #endif 00075 for(k = 0; k < n4; k++) { 00076 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it 00077 __asm__ volatile( 00078 "movd %0, %%mm0 \n" 00079 "movd %2, %%mm1 \n" 00080 "punpckldq %1, %%mm0 \n" 00081 "punpckldq %3, %%mm1 \n" 00082 "movq %%mm0, %%mm2 \n" 00083 PSWAPD( %%mm1, %%mm3 ) 00084 "pfmul %%mm1, %%mm0 \n" 00085 "pfmul %%mm3, %%mm2 \n" 00086 #ifdef EMULATE_3DNOWEXT 00087 "movq %%mm0, %%mm1 \n" 00088 "punpckhdq %%mm2, %%mm0 \n" 00089 "punpckldq %%mm2, %%mm1 \n" 00090 "pxor %%mm7, %%mm0 \n" 00091 "pfadd %%mm1, %%mm0 \n" 00092 #else 00093 "pfpnacc %%mm2, %%mm0 \n" 00094 #endif 00095 ::"m"(in2[-2*k]), "m"(in1[2*k]), 00096 "m"(tcos[k]), "m"(tsin[k]) 00097 ); 00098 __asm__ volatile( 00099 "movq %%mm0, %0 \n\t" 00100 :"=m"(z[revtab[k]]) 00101 ); 00102 } 00103 00104 ff_fft_dispatch_3dn2(z, s->nbits); 00105 00106 #define CMUL(j,mm0,mm1)\ 00107 "movq (%2,"#j",2), %%mm6 \n"\ 00108 "movq 8(%2,"#j",2), "#mm0"\n"\ 00109 "movq %%mm6, "#mm1"\n"\ 00110 "movq "#mm0",%%mm7 \n"\ 00111 "pfmul (%3,"#j"), %%mm6 \n"\ 00112 "pfmul (%4,"#j"), "#mm0"\n"\ 00113 "pfmul (%4,"#j"), "#mm1"\n"\ 00114 "pfmul (%3,"#j"), %%mm7 \n"\ 00115 "pfsub %%mm6, "#mm0"\n"\ 00116 "pfadd %%mm7, "#mm1"\n" 00117 00118 /* post rotation */ 00119 j = -n2; 00120 k = n2-8; 00121 __asm__ volatile( 00122 "1: \n" 00123 CMUL(%0, %%mm0, %%mm1) 00124 CMUL(%1, %%mm2, %%mm3) 00125 "movd %%mm0, (%2,%0,2) \n" 00126 "movd %%mm1,12(%2,%1,2) \n" 00127 "movd %%mm2, (%2,%1,2) \n" 00128 "movd %%mm3,12(%2,%0,2) \n" 00129 "psrlq $32, %%mm0 \n" 00130 "psrlq $32, %%mm1 \n" 00131 "psrlq $32, %%mm2 \n" 00132 "psrlq $32, %%mm3 \n" 00133 "movd %%mm0, 8(%2,%0,2) \n" 00134 "movd %%mm1, 4(%2,%1,2) \n" 00135 "movd %%mm2, 8(%2,%1,2) \n" 00136 "movd %%mm3, 4(%2,%0,2) \n" 00137 "sub $8, %1 \n" 00138 "add $8, %0 \n" 00139 "jl 1b \n" 00140 :"+r"(j), "+r"(k) 00141 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) 00142 :"memory" 00143 ); 00144 __asm__ volatile("femms"); 00145 } 00146 00147 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) 00148 { 00149 x86_reg j, k; 00150 long n = s->mdct_size; 00151 long n4 = n >> 2; 00152 00153 ff_imdct_half_3dn2(s, output+n4, input); 00154 00155 j = -n; 00156 k = n-8; 00157 __asm__ volatile( 00158 "movq %4, %%mm7 \n" 00159 "1: \n" 00160 PSWAPD((%2,%1), %%mm0) 00161 PSWAPD((%3,%0), %%mm1) 00162 "pxor %%mm7, %%mm0 \n" 00163 "movq %%mm1, (%3,%1) \n" 00164 "movq %%mm0, (%2,%0) \n" 00165 "sub $8, %1 \n" 00166 "add $8, %0 \n" 00167 "jl 1b \n" 00168 :"+r"(j), "+r"(k) 00169 :"r"(output+n4), "r"(output+n4*3), 00170 "m"(*m1m1) 00171 ); 00172 __asm__ volatile("femms"); 00173 } 00174