• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/vp3dsp_mmx.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2004 the ffmpeg project
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "dsputil_mmx.h"
00029 
00030 extern const uint16_t ff_vp3_idct_data[];
00031 
// Loop filter core shared by the vertical and horizontal filter functions below.
00032 // this is off by one or two for some cases when filter_limit is greater than 63
00033 // in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
00034 // out: p1 in mm4, p2 in mm3
// flim is pasted into the asm text as the source operand of a movq into mm5.
// Every register mm0-mm7 is overwritten by the macro.
00035 #define VP3_LOOP_FILTER(flim) \
00036     "movq       %%mm6, %%mm7 \n\t" /* mm7 = copy of p0 */ \
00037     "pand    "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
00038     "psrlw         $3, %%mm7 \n\t" \
00039     "pand    "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
00040     "movq       %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
00041     "pxor       %%mm4, %%mm2 \n\t" \
00042     "pand    "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
00043     "movq       %%mm2, %%mm5 \n\t" \
00044     "paddb      %%mm2, %%mm2 \n\t" \
00045     "paddb      %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
00046     "paddb      %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
00047     "pcmpeqb    %%mm0, %%mm0 \n\t" /* mm0 = all bits set */ \
00048     "pxor       %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
00049     "pavgb      %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
00050     "pxor       %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
00051     "pavgb      %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
00052     "paddb   "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
00053     "pavgb      %%mm0, %%mm1 \n\t" /* 128+2+(   p2-p1  - p3) >> 2 */ \
00054     "pavgb      %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
00055     "paddusb    %%mm1, %%mm7 \n\t" /* d+128+1 */ \
00056     "movq    "MANGLE(ff_pb_81)", %%mm6 \n\t" \
00057     "psubusb    %%mm7, %%mm6 \n\t" /* mm6 = ff_pb_81 - mm7, saturated at 0 */ \
00058     "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" /* mm7 = mm7 - ff_pb_81, saturated at 0 */ \
00059 \
00060     "movq     "#flim", %%mm5 \n\t" /* mm5 = flim */ \
00061     "pminub     %%mm5, %%mm6 \n\t" \
00062     "pminub     %%mm5, %%mm7 \n\t" \
00063     "movq       %%mm6, %%mm0 \n\t" \
00064     "movq       %%mm7, %%mm1 \n\t" \
00065     "paddb      %%mm6, %%mm6 \n\t" \
00066     "paddb      %%mm7, %%mm7 \n\t" \
00067     "pminub     %%mm5, %%mm6 \n\t" \
00068     "pminub     %%mm5, %%mm7 \n\t" \
00069     "psubb      %%mm0, %%mm6 \n\t" \
00070     "psubb      %%mm1, %%mm7 \n\t" \
00071     "paddusb    %%mm7, %%mm4 \n\t" /* p1 += mm7 (unsigned saturating) */ \
00072     "psubusb    %%mm6, %%mm4 \n\t" /* p1 -= mm6 (unsigned saturating) */ \
00073     "psubusb    %%mm7, %%mm3 \n\t" /* p2 -= mm7 (unsigned saturating) */ \
00074     "paddusb    %%mm6, %%mm3 \n\t" /* p2 += mm6 (unsigned saturating) */
00075 
/* Stores the four 16-bit words of the MMX register mm, lowest word first,
 * to the memory operands dst0..dst3, each with a displacement of -1 byte.
 * Asm operand %0 is used as a scratch general-purpose register.
 * mm is shifted right by 32 bits along the way, so its contents are destroyed. */
00076 #define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
00077     "movd "#mm", %0        \n\t" /* %0 = words 1:0 of mm */ \
00078     "movw   %w0, -1"#dst0" \n\t" /* store word 0 */ \
00079     "psrlq  $32, "#mm"     \n\t" /* move words 3:2 of mm down */ \
00080     "shr    $16, %0        \n\t" \
00081     "movw   %w0, -1"#dst1" \n\t" /* store word 1 */ \
00082     "movd "#mm", %0        \n\t" /* %0 = former words 3:2 of mm */ \
00083     "movw   %w0, -1"#dst2" \n\t" /* store word 2 */ \
00084     "shr    $16, %0        \n\t" \
00085     "movw   %w0, -1"#dst3" \n\t" /* store word 3 */
00086 
/* Applies VP3_LOOP_FILTER to 8 bytes of each of four consecutive rows.
 *
 * src             points at the third of the four rows; the rows used are
 *                 src - 2*stride, src - stride, src and src + stride
 * stride          distance in bytes between rows
 * bounding_values the 8 bytes at bounding_values+129 are passed to the
 *                 filter as its limit operand
 *
 * Only the rows at src - stride and src are written back. */
00087 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
00088 {
00089     __asm__ volatile(
00090         "movq          %0, %%mm6 \n\t" /* p0 = row at src - 2*stride */
00091         "movq          %1, %%mm4 \n\t" /* p1 = row at src - 1*stride */
00092         "movq          %2, %%mm2 \n\t" /* p2 = row at src */
00093         "movq          %3, %%mm1 \n\t" /* p3 = row at src + 1*stride */
00094 
00095         VP3_LOOP_FILTER(%4)
00096 
00097         "movq       %%mm4, %1    \n\t" /* store filtered p1 */
00098         "movq       %%mm3, %2    \n\t" /* store filtered p2 */
00099 
00100         : "+m" (*(uint64_t*)(src - 2*stride)),
00101           "+m" (*(uint64_t*)(src - 1*stride)),
00102           "+m" (*(uint64_t*)(src + 0*stride)),
00103           "+m" (*(uint64_t*)(src + 1*stride))
00104         : "m"(*(uint64_t*)(bounding_values+129))
00105     );
00106 }
00107 
/* Applies VP3_LOOP_FILTER across a vertical edge, covering 8 rows.
 *
 * src             points at the first row; 4 bytes starting at src - 2 are
 *                 read from each of the first four rows, and the operands
 *                 handed to TRANSPOSE8x4 address the same columns of the
 *                 four rows starting at src + 4*stride
 * stride          distance in bytes between rows
 * bounding_values the 8 bytes at bounding_values+129 are passed to the
 *                 filter as its limit operand
 *
 * Two bytes per row, starting at offset -1, are written back for all 8 rows.
 * TRANSPOSE8x4 and SBUTTERFLY are defined outside this file (presumably in
 * dsputil_mmx.h -- confirm); their exact register effects are not visible here. */
00108 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
00109 {
00110     x86_reg tmp; /* scratch register for STORE_4_WORDS (asm operand %0) */
00111 
00112     __asm__ volatile(
00113         "movd -2(%1),      %%mm6 \n\t" /* row 0 */
00114         "movd -2(%1,%3),   %%mm0 \n\t" /* row 1 */
00115         "movd -2(%1,%3,2), %%mm1 \n\t" /* row 2 */
00116         "movd -2(%1,%4),   %%mm4 \n\t" /* row 3 */
00117 
00118         TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
00119         VP3_LOOP_FILTER(%5)
00120         SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) /* presumably interleaves the filtered p1/p2 bytes into mm4 and mm5 */
00121 
00122         STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) /* rows 0..3 */
00123         STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) /* rows 4..7 */
00124 
00125         : "=&r"(tmp)
00126         : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
00127           "m"(*(uint64_t*)(bounding_values+129))
00128         : "memory"
00129     );
00130 }
00131 
00132 /* from original comments: The Macro does IDct on 4 1-D Dcts
 *
 * Common first part of RowIDCT() and ColumnIDCT(). Each MMX register holds
 * four 16-bit lanes, so four 1-D transforms are computed at once.
 * Inputs are read through the I(), J() and C() macros, which the user of
 * this macro must define. I(1) and I(2) are overwritten with the
 * intermediate values C. and D. respectively.
 * On exit: r0 = C., r1 = H., r2 = R2, r4 = E, r5 = B.., r6 = F., r7 = G;
 * r3 no longer holds a needed value and D. is in I(2). */
00133 #define BeginIDCT() \
00134     "movq   "I(3)", %%mm2 \n\t" \
00135     "movq   "C(3)", %%mm6 \n\t" \
00136     "movq    %%mm2, %%mm4 \n\t" \
00137     "movq   "J(5)", %%mm7 \n\t" \
00138     "pmulhw  %%mm6, %%mm4 \n\t"    /* r4 = c3*i3 - i3 */ \
00139     "movq   "C(5)", %%mm1 \n\t" \
00140     "pmulhw  %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 - i5 */ \
00141     "movq    %%mm1, %%mm5 \n\t" \
00142     "pmulhw  %%mm2, %%mm1 \n\t"    /* r1 = c5*i3 - i3 */ \
00143     "movq   "I(1)", %%mm3 \n\t" \
00144     "pmulhw  %%mm7, %%mm5 \n\t"    /* r5 = c5*i5 - i5 */ \
00145     "movq   "C(1)", %%mm0 \n\t" \
00146     "paddw   %%mm2, %%mm4 \n\t"    /* r4 = c3*i3 */ \
00147     "paddw   %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 */ \
00148     "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c5*i3 */ \
00149     "movq   "J(7)", %%mm1 \n\t" \
00150     "paddw   %%mm5, %%mm7 \n\t"    /* r7 = c5*i5 */ \
00151     "movq    %%mm0, %%mm5 \n\t"    /* r5 = c1 */ \
00152     "pmulhw  %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 - i1 */ \
00153     "paddsw  %%mm7, %%mm4 \n\t"    /* r4 = C = c3*i3 + c5*i5 */ \
00154     "pmulhw  %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 - i7 */ \
00155     "movq   "C(7)", %%mm7 \n\t" \
00156     "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = D = c3*i5 - c5*i3 */ \
00157     "paddw   %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 */ \
00158     "pmulhw  %%mm7, %%mm3 \n\t"    /* r3 = c7*i1 */ \
00159     "movq   "I(2)", %%mm2 \n\t" \
00160     "pmulhw  %%mm1, %%mm7 \n\t"    /* r7 = c7*i7 */ \
00161     "paddw   %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 */ \
00162     "movq    %%mm2, %%mm1 \n\t"    /* r1 = i2 */ \
00163     "pmulhw "C(2)", %%mm2 \n\t"    /* r2 = c2*i2 - i2 */ \
00164     "psubsw  %%mm5, %%mm3 \n\t"    /* r3 = B = c7*i1 - c1*i7 */ \
00165     "movq   "J(6)", %%mm5 \n\t" \
00166     "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = A = c1*i1 + c7*i7 */ \
00167     "movq    %%mm5, %%mm7 \n\t"    /* r7 = i6 */ \
00168     "psubsw  %%mm4, %%mm0 \n\t"    /* r0 = A - C */ \
00169     "pmulhw "C(2)", %%mm5 \n\t"    /* r5 = c2*i6 - i6 */ \
00170     "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c2*i2 */ \
00171     "pmulhw "C(6)", %%mm1 \n\t"    /* r1 = c6*i2 */ \
00172     "paddsw  %%mm4, %%mm4 \n\t"    /* r4 = C + C */ \
00173     "paddsw  %%mm0, %%mm4 \n\t"    /* r4 = C. = A + C */ \
00174     "psubsw  %%mm6, %%mm3 \n\t"    /* r3 = B - D */ \
00175     "paddw   %%mm7, %%mm5 \n\t"    /* r5 = c2*i6 */ \
00176     "paddsw  %%mm6, %%mm6 \n\t"    /* r6 = D + D */ \
00177     "pmulhw "C(6)", %%mm7 \n\t"    /* r7 = c6*i6 */ \
00178     "paddsw  %%mm3, %%mm6 \n\t"    /* r6 = D. = B + D */ \
00179     "movq    %%mm4, "I(1)"\n\t"    /* save C. at I(1) */ \
00180     "psubsw  %%mm5, %%mm1 \n\t"    /* r1 = H = c6*i2 - c2*i6 */ \
00181     "movq   "C(4)", %%mm4 \n\t" \
00182     "movq    %%mm3, %%mm5 \n\t"    /* r5 = B - D */ \
00183     "pmulhw  %%mm4, %%mm3 \n\t"    /* r3 = (c4 - 1) * (B - D) */ \
00184     "paddsw  %%mm2, %%mm7 \n\t"    /* r7 = G = c6*i6 + c2*i2 */ \
00185     "movq    %%mm6, "I(2)"\n\t"    /* save D. at I(2) */ \
00186     "movq    %%mm0, %%mm2 \n\t"    /* r2 = A - C */ \
00187     "movq   "I(0)", %%mm6 \n\t" \
00188     "pmulhw  %%mm4, %%mm0 \n\t"    /* r0 = (c4 - 1) * (A - C) */ \
00189     "paddw   %%mm3, %%mm5 \n\t"    /* r5 = B. = c4 * (B - D) */ \
00190     "movq   "J(4)", %%mm3 \n\t" \
00191     "psubsw  %%mm1, %%mm5 \n\t"    /* r5 = B.. = B. - H */ \
00192     "paddw   %%mm0, %%mm2 \n\t"    /* r2 = A. = c4 * (A - C) */ \
00193     "psubsw  %%mm3, %%mm6 \n\t"    /* r6 = i0 - i4 */ \
00194     "movq    %%mm6, %%mm0 \n\t"    /* r0 = i0 - i4 */ \
00195     "pmulhw  %%mm4, %%mm6 \n\t"    /* r6 = (c4 - 1) * (i0 - i4) */ \
00196     "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = i4 + i4 */ \
00197     "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H + H */ \
00198     "paddsw  %%mm0, %%mm3 \n\t"    /* r3 = i0 + i4 */ \
00199     "paddsw  %%mm5, %%mm1 \n\t"    /* r1 = H. = B. + H */ \
00200     "pmulhw  %%mm3, %%mm4 \n\t"    /* r4 = (c4 - 1) * (i0 + i4) */ \
00201     "paddsw  %%mm0, %%mm6 \n\t"    /* r6 = F = c4 * (i0 - i4) */ \
00202     "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = F. = F - A. */ \
00203     "paddsw  %%mm2, %%mm2 \n\t"    /* r2 = A. + A. */ \
00204     "movq   "I(1)", %%mm0 \n\t"    /* r0 = C. */ \
00205     "paddsw  %%mm6, %%mm2 \n\t"    /* r2 = A.. = F + A. */ \
00206     "paddw   %%mm3, %%mm4 \n\t"    /* r4 = E = c4 * (i0 + i4) */ \
00207     "psubsw  %%mm1, %%mm2 \n\t"    /* r2 = R2 = A.. - H. */
00208 
00209 /* RowIDCT gets ready to transpose
 *
 * Finishes the butterflies started by BeginIDCT() without any rounding or
 * shifting. On exit R0 is in r0, R1 has been stored to I(1), and R2..R7 are
 * in r2..r7, which is the entry state Transpose() expects. */
00210 #define RowIDCT() \
00211     BeginIDCT() \
00212     "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
00213     "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
00214     "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
00215     "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
00216     "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
00217     "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
00218     "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
00219     "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
00220     "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
00221     "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
00222     "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
00223     "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
00224     "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
00225     "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
00226     "movq    %%mm1, "I(1)"\n\t"    /* save R1 */ \
00227     "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */
00228 
00229 /* Column IDCT normalizes and stores final results
 *
 * Finishes the butterflies started by BeginIDCT(). The OC_8 operand is
 * added before each pair of results is formed and every result is then
 * shifted right arithmetically by 4 (NRx = normalized Rx). The eight
 * results are written to I(0)..I(3) and J(4)..J(7). OC_8, I() and J() must
 * be defined by the user of this macro. */
00230 #define ColumnIDCT() \
00231     BeginIDCT() \
00232     "paddsw "OC_8", %%mm2 \n\t"    /* adjust R2 (and R1) for shift */ \
00233     "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
00234     "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
00235     "psraw      $4, %%mm2 \n\t"    /* r2 = NR2 */ \
00236     "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
00237     "psraw      $4, %%mm1 \n\t"    /* r1 = NR1 */ \
00238     "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
00239     "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
00240     "movq    %%mm2, "I(2)"\n\t"    /* store NR2 at I2 */ \
00241     "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
00242     "movq    %%mm1, "I(1)"\n\t"    /* store NR1 at I1 */ \
00243     "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
00244     "paddsw "OC_8", %%mm4 \n\t"    /* adjust R4 (and R3) for shift */ \
00245     "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
00246     "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
00247     "psraw      $4, %%mm4 \n\t"    /* r4 = NR4 */ \
00248     "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
00249     "psraw      $4, %%mm3 \n\t"    /* r3 = NR3 */ \
00250     "paddsw "OC_8", %%mm6 \n\t"    /* adjust R6 (and R5) for shift */ \
00251     "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
00252     "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
00253     "psraw      $4, %%mm6 \n\t"    /* r6 = NR6 */ \
00254     "movq    %%mm4, "J(4)"\n\t"    /* store NR4 at J4 */ \
00255     "psraw      $4, %%mm5 \n\t"    /* r5 = NR5 */ \
00256     "movq    %%mm3, "I(3)"\n\t"    /* store NR3 at I3 */ \
00257     "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
00258     "paddsw "OC_8", %%mm7 \n\t"    /* adjust R7 (and R0) for shift */ \
00259     "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
00260     "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */ \
00261     "psraw      $4, %%mm7 \n\t"    /* r7 = NR7 */ \
00262     "movq    %%mm6, "J(6)"\n\t"    /* store NR6 at J6 */ \
00263     "psraw      $4, %%mm0 \n\t"    /* r0 = NR0 */ \
00264     "movq    %%mm5, "J(5)"\n\t"    /* store NR5 at J5 */ \
00265     "movq    %%mm7, "J(7)"\n\t"    /* store NR7 at J7 */ \
00266     "movq    %%mm0, "I(0)"\n\t"    /* store NR0 at I0 */
00267 
00268 /* Following macro does two 4x4 transposes of 16-bit words in place.
00269 
00270   At entry (we assume):
00271 
00272     r0 = a3 a2 a1 a0
00273     I(1) = b3 b2 b1 b0
00274     r2 = c3 c2 c1 c0
00275     r3 = d3 d2 d1 d0
00276 
00277     r4 = e3 e2 e1 e0
00278     r5 = f3 f2 f1 f0
00279     r6 = g3 g2 g1 g0
00280     r7 = h3 h2 h1 h0
00281 
00282   At exit, we have:
00283 
00284     I(0) = d0 c0 b0 a0
00285     I(1) = d1 c1 b1 a1
00286     I(2) = d2 c2 b2 a2
00287     I(3) = d3 c3 b3 a3
00288 
00289     J(4) = h0 g0 f0 e0
00290     J(5) = h1 g1 f1 e1
00291     J(6) = h2 g2 f2 e2
00292     J(7) = h3 g3 f3 e3
00293 
00294    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
00295    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
00296 
00297    Since r1 is free at entry, we calculate the Js first.

   Each row above is written with its most significant word first.
   I(0) doubles as scratch space: r0 is spilled there while the Js are
   computed and is reloaded afterwards. r3 and r7 are only read; the other
   six registers are overwritten. */
00298 #define Transpose() \
00299     "movq       %%mm4, %%mm1 \n\t"    /* r1 = e3 e2 e1 e0 */ \
00300     "punpcklwd  %%mm5, %%mm4 \n\t"    /* r4 = f1 e1 f0 e0 */ \
00301     "movq       %%mm0, "I(0)"\n\t"    /* save a3 a2 a1 a0 */ \
00302     "punpckhwd  %%mm5, %%mm1 \n\t"    /* r1 = f3 e3 f2 e2 */ \
00303     "movq       %%mm6, %%mm0 \n\t"    /* r0 = g3 g2 g1 g0 */ \
00304     "punpcklwd  %%mm7, %%mm6 \n\t"    /* r6 = h1 g1 h0 g0 */ \
00305     "movq       %%mm4, %%mm5 \n\t"    /* r5 = f1 e1 f0 e0 */ \
00306     "punpckldq  %%mm6, %%mm4 \n\t"    /* r4 = h0 g0 f0 e0 = R4 */ \
00307     "punpckhdq  %%mm6, %%mm5 \n\t"    /* r5 = h1 g1 f1 e1 = R5 */ \
00308     "movq       %%mm1, %%mm6 \n\t"    /* r6 = f3 e3 f2 e2 */ \
00309     "movq       %%mm4, "J(4)"\n\t" \
00310     "punpckhwd  %%mm7, %%mm0 \n\t"    /* r0 = h3 g3 h2 g2 */ \
00311     "movq       %%mm5, "J(5)"\n\t" \
00312     "punpckhdq  %%mm0, %%mm6 \n\t"    /* r6 = h3 g3 f3 e3 = R7 */ \
00313     "movq      "I(0)", %%mm4 \n\t"    /* r4 = a3 a2 a1 a0 */ \
00314     "punpckldq  %%mm0, %%mm1 \n\t"    /* r1 = h2 g2 f2 e2 = R6 */ \
00315     "movq      "I(1)", %%mm5 \n\t"    /* r5 = b3 b2 b1 b0 */ \
00316     "movq       %%mm4, %%mm0 \n\t"    /* r0 = a3 a2 a1 a0 */ \
00317     "movq       %%mm6, "J(7)"\n\t" \
00318     "punpcklwd  %%mm5, %%mm0 \n\t"    /* r0 = b1 a1 b0 a0 */ \
00319     "movq       %%mm1, "J(6)"\n\t" \
00320     "punpckhwd  %%mm5, %%mm4 \n\t"    /* r4 = b3 a3 b2 a2 */ \
00321     "movq       %%mm2, %%mm5 \n\t"    /* r5 = c3 c2 c1 c0 */ \
00322     "punpcklwd  %%mm3, %%mm2 \n\t"    /* r2 = d1 c1 d0 c0 */ \
00323     "movq       %%mm0, %%mm1 \n\t"    /* r1 = b1 a1 b0 a0 */ \
00324     "punpckldq  %%mm2, %%mm0 \n\t"    /* r0 = d0 c0 b0 a0 = R0 */ \
00325     "punpckhdq  %%mm2, %%mm1 \n\t"    /* r1 = d1 c1 b1 a1 = R1 */ \
00326     "movq       %%mm4, %%mm2 \n\t"    /* r2 = b3 a3 b2 a2 */ \
00327     "movq       %%mm0, "I(0)"\n\t" \
00328     "punpckhwd  %%mm3, %%mm5 \n\t"    /* r5 = d3 c3 d2 c2 */ \
00329     "movq       %%mm1, "I(1)"\n\t" \
00330     "punpckhdq  %%mm5, %%mm4 \n\t"    /* r4 = d3 c3 b3 a3 = R3 */ \
00331     "punpckldq  %%mm5, %%mm2 \n\t"    /* r2 = d2 c2 b2 a2 = R2 */ \
00332     "movq       %%mm4, "I(3)"\n\t" \
00333     "movq       %%mm2, "I(2)"\n\t"
00334 
/* In-place 8x8 inverse DCT on 16-bit coefficients.
 *
 * output_data  the block to transform; all 128 bytes starting at this
 *              address are read and overwritten with the result.
 *
 * Two RowIDCT()+Transpose() passes handle rows 0..3 and rows 4..7, then two
 * ColumnIDCT() passes handle words 0..3 and words 4..7 of every row. */
00335 void ff_vp3_idct_mmx(int16_t *output_data)
00336 {
00337     /* asm operands:
00338      *  %0 = output_data, the block that is transformed in place
00339      *  %1 = ff_vp3_idct_data, the IDCT constants
00340      *  C(x) = the constant at byte offset 16*(x-1) from %1
00341      *  %2 = ff_pw_8, referenced as OC_8 by ColumnIDCT()
00342      *  I(x), J(x) = byte offsets into %0, redefined before each pass
00343      * r0..r7 = mm0..mm7
00344      */
00345 
00346 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
00347 #define OC_8 "%2"
00348 
00349     /* the caller is expected to have completed dequantization + dezigzag +
00350      * partial transposition (not visible here); now do the idct itself */
00351 #define I(x) AV_STRINGIFY(16* x       )"(%0)" /* rows 0..3, words 0..3 */
00352 #define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" /* rows 0..3, words 4..7 */
00353 
00354     __asm__ volatile (
00355         RowIDCT()
00356         Transpose()
00357 
00358 #undef I
00359 #undef J
00360 #define I(x) AV_STRINGIFY(16* x    + 64)"(%0)" /* rows 4..7, words 0..3 */
00361 #define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" /* rows 4..7, words 4..7 */
00362 
00363         RowIDCT()
00364         Transpose()
00365 
00366 #undef I
00367 #undef J
00368 #define I(x) AV_STRINGIFY(16*x)"(%0)" /* rows 0..7, words 0..3 */
00369 #define J(x) AV_STRINGIFY(16*x)"(%0)"
00370 
00371         ColumnIDCT()
00372 
00373 #undef I
00374 #undef J
00375 #define I(x) AV_STRINGIFY(16*x + 8)"(%0)" /* rows 0..7, words 4..7 */
00376 #define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
00377 
00378         ColumnIDCT()
        /* The block is read and written through the register operand %0, so
         * the compiler must be told that memory is accessed and modified. */
00379         :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) : "memory"
00380     );
00381 #undef I
00382 #undef J
00383 
00384 }
00385 
/* Runs ff_vp3_idct_mmx() on block in place, then hands the transformed block
 * to put_signed_pixels_clamped_mmx() together with dest and line_size.
 * That helper is defined elsewhere; presumably it stores the clamped
 * samples to dest -- confirm against its definition. */
00386 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
00387 {
00388     ff_vp3_idct_mmx(block);
00389     put_signed_pixels_clamped_mmx(block, dest, line_size);
00390 }
00391 
/* Runs ff_vp3_idct_mmx() on block in place, then hands the transformed block
 * to add_pixels_clamped_mmx() together with dest and line_size.
 * That helper is defined elsewhere; presumably it adds the samples to the
 * pixels already at dest with clamping -- confirm against its definition. */
00392 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
00393 {
00394     ff_vp3_idct_mmx(block);
00395     add_pixels_clamped_mmx(block, dest, line_size);
00396 }

Generated on Tue Nov 4 2014 12:59:23 for ffmpeg by  doxygen 1.7.1