Libav 0.7.1
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
        "movq "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq %%mm3, %%mm4                      \n\t"\
        "lea " offset "(%0), %%"REG_d"          \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        ".p2align 4                             \n\t" /* FIXME Unroll? */\
        "1:                                     \n\t"\
        "movq 8(%%"REG_d"), %%mm0               \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %3, 2), %%mm2         \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm5        \n\t" /* srcData */\
        "add $16, %%"REG_d"                     \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "test %%"REG_S", %%"REG_S"              \n\t"\
        "pmulhw %%mm0, %%mm2                    \n\t"\
        "pmulhw %%mm0, %%mm5                    \n\t"\
        "paddw %%mm2, %%mm3                     \n\t"\
        "paddw %%mm5, %%mm4                     \n\t"\
        " jnz 1b                                \n\t"\
        "psraw $3, %%mm3                        \n\t"\
        "psraw $3, %%mm4                        \n\t"\
        "packuswb %%mm4, %%mm3                  \n\t"\
        MOVNTQ(%%mm3, (%1, %3))\
        "add $8, %3                             \n\t"\
        "cmp %2, %3                             \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq %%mm3, %%mm4                      \n\t"\
        "lea " offset "(%0), %%"REG_d"          \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "jb 1b                                  \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_d, "%"REG_S\
    );

static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
                             const int16_t **chrVSrc,
                             int chrFilterSize, const int16_t **alpSrc,
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                             uint8_t *aDest, int dstW, int chrDstW)
{
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }

    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}

#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d"          \n\t"\
        "pxor %%mm4, %%mm4                      \n\t"\
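        /* Rough scalar equivalent of the YSCALEYUV2YV12X loop above, for one
         * plane; the variable names here are purely illustrative and not part
         * of this file. pmulhw keeps only the high 16 bits of each signed
         * product, and the accumulator starts from the dither/rounder value
         * loaded from VROUNDER_OFFSET:
         *
         *     for (i = 0; i < width; i++) {
         *         int val = rounder;
         *         for (j = 0; j < filterSize; j++)
         *             val += (src[j][i] * filter[j]) >> 16;  // pmulhw
         *         dest[i] = av_clip_uint8(val >> 3);         // psraw $3 + packuswb
         *     }
         *
         * The YSCALEYUV2YV12X_ACCURATE variant being defined here performs the
         * same accumulation with pmaddwd, adding the full 32-bit products
         * before shifting, so less precision is lost per tap. */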
"pxor %%mm5, %%mm5 \n\t"\ 00098 "pxor %%mm6, %%mm6 \n\t"\ 00099 "pxor %%mm7, %%mm7 \n\t"\ 00100 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00101 ".p2align 4 \n\t"\ 00102 "1: \n\t"\ 00103 "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\ 00104 "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\ 00105 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 00106 "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\ 00107 "movq %%mm0, %%mm3 \n\t"\ 00108 "punpcklwd %%mm1, %%mm0 \n\t"\ 00109 "punpckhwd %%mm1, %%mm3 \n\t"\ 00110 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ 00111 "pmaddwd %%mm1, %%mm0 \n\t"\ 00112 "pmaddwd %%mm1, %%mm3 \n\t"\ 00113 "paddd %%mm0, %%mm4 \n\t"\ 00114 "paddd %%mm3, %%mm5 \n\t"\ 00115 "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\ 00116 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 00117 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 00118 "test %%"REG_S", %%"REG_S" \n\t"\ 00119 "movq %%mm2, %%mm0 \n\t"\ 00120 "punpcklwd %%mm3, %%mm2 \n\t"\ 00121 "punpckhwd %%mm3, %%mm0 \n\t"\ 00122 "pmaddwd %%mm1, %%mm2 \n\t"\ 00123 "pmaddwd %%mm1, %%mm0 \n\t"\ 00124 "paddd %%mm2, %%mm6 \n\t"\ 00125 "paddd %%mm0, %%mm7 \n\t"\ 00126 " jnz 1b \n\t"\ 00127 "psrad $16, %%mm4 \n\t"\ 00128 "psrad $16, %%mm5 \n\t"\ 00129 "psrad $16, %%mm6 \n\t"\ 00130 "psrad $16, %%mm7 \n\t"\ 00131 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 00132 "packssdw %%mm5, %%mm4 \n\t"\ 00133 "packssdw %%mm7, %%mm6 \n\t"\ 00134 "paddw %%mm0, %%mm4 \n\t"\ 00135 "paddw %%mm0, %%mm6 \n\t"\ 00136 "psraw $3, %%mm4 \n\t"\ 00137 "psraw $3, %%mm6 \n\t"\ 00138 "packuswb %%mm6, %%mm4 \n\t"\ 00139 MOVNTQ(%%mm4, (%1, %3))\ 00140 "add $8, %3 \n\t"\ 00141 "cmp %2, %3 \n\t"\ 00142 "lea " offset "(%0), %%"REG_d" \n\t"\ 00143 "pxor %%mm4, %%mm4 \n\t"\ 00144 "pxor %%mm5, %%mm5 \n\t"\ 00145 "pxor %%mm6, %%mm6 \n\t"\ 00146 "pxor %%mm7, %%mm7 \n\t"\ 00147 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00148 "jb 1b \n\t"\ 00149 :: "r" (&c->redDither),\ 00150 "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\ 00151 : "%"REG_a, "%"REG_d, "%"REG_S\ 00152 ); 00153 00154 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, 00155 const int16_t **lumSrc, int lumFilterSize, 00156 const int16_t *chrFilter, const int16_t **chrUSrc, 00157 const int16_t **chrVSrc, 00158 int chrFilterSize, const int16_t **alpSrc, 00159 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, 00160 uint8_t *aDest, int dstW, int chrDstW) 00161 { 00162 if (uDest) { 00163 x86_reg uv_off = c->uv_off; 00164 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) 00165 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) 00166 } 00167 if (CONFIG_SWSCALE_ALPHA && aDest) { 00168 YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) 00169 } 00170 00171 YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0) 00172 } 00173 00174 static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, 00175 const int16_t *chrUSrc, const int16_t *chrVSrc, 00176 const int16_t *alpSrc, 00177 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, 00178 uint8_t *aDest, int dstW, int chrDstW) 00179 { 00180 int p= 4; 00181 const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW }; 00182 uint8_t *dst[4]= { aDest, dest, uDest, vDest }; 00183 x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW }; 00184 00185 while (p--) { 00186 if (dst[p]) { 00187 __asm__ volatile( 00188 "mov %2, %%"REG_a" \n\t" 00189 ".p2align 4 \n\t" /* FIXME Unroll? 
*/ 00190 "1: \n\t" 00191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" 00192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" 00193 "psraw $7, %%mm0 \n\t" 00194 "psraw $7, %%mm1 \n\t" 00195 "packuswb %%mm1, %%mm0 \n\t" 00196 MOVNTQ(%%mm0, (%1, %%REGa)) 00197 "add $8, %%"REG_a" \n\t" 00198 "jnc 1b \n\t" 00199 :: "r" (src[p]), "r" (dst[p] + counter[p]), 00200 "g" (-counter[p]) 00201 : "%"REG_a 00202 ); 00203 } 00204 } 00205 } 00206 00207 static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, 00208 const int16_t *chrUSrc, const int16_t *chrVSrc, 00209 const int16_t *alpSrc, 00210 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, 00211 uint8_t *aDest, int dstW, int chrDstW) 00212 { 00213 int p= 4; 00214 const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW }; 00215 uint8_t *dst[4]= { aDest, dest, uDest, vDest }; 00216 x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW }; 00217 00218 while (p--) { 00219 if (dst[p]) { 00220 __asm__ volatile( 00221 "mov %2, %%"REG_a" \n\t" 00222 "pcmpeqw %%mm7, %%mm7 \n\t" 00223 "psrlw $15, %%mm7 \n\t" 00224 "psllw $6, %%mm7 \n\t" 00225 ".p2align 4 \n\t" /* FIXME Unroll? */ 00226 "1: \n\t" 00227 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" 00228 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" 00229 "paddsw %%mm7, %%mm0 \n\t" 00230 "paddsw %%mm7, %%mm1 \n\t" 00231 "psraw $7, %%mm0 \n\t" 00232 "psraw $7, %%mm1 \n\t" 00233 "packuswb %%mm1, %%mm0 \n\t" 00234 MOVNTQ(%%mm0, (%1, %%REGa)) 00235 "add $8, %%"REG_a" \n\t" 00236 "jnc 1b \n\t" 00237 :: "r" (src[p]), "r" (dst[p] + counter[p]), 00238 "g" (-counter[p]) 00239 : "%"REG_a 00240 ); 00241 } 00242 } 00243 } 00244 00245 #define YSCALEYUV2PACKEDX_UV \ 00246 __asm__ volatile(\ 00247 "xor %%"REG_a", %%"REG_a" \n\t"\ 00248 ".p2align 4 \n\t"\ 00249 "nop \n\t"\ 00250 "1: \n\t"\ 00251 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 00252 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00253 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 00254 "movq %%mm3, %%mm4 \n\t"\ 00255 ".p2align 4 \n\t"\ 00256 "2: \n\t"\ 00257 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 00258 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ 00259 "add %6, %%"REG_S" \n\t" \ 00260 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ 00261 "add $16, %%"REG_d" \n\t"\ 00262 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00263 "pmulhw %%mm0, %%mm2 \n\t"\ 00264 "pmulhw %%mm0, %%mm5 \n\t"\ 00265 "paddw %%mm2, %%mm3 \n\t"\ 00266 "paddw %%mm5, %%mm4 \n\t"\ 00267 "test %%"REG_S", %%"REG_S" \n\t"\ 00268 " jnz 2b \n\t"\ 00269 00270 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 00271 "lea "offset"(%0), %%"REG_d" \n\t"\ 00272 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00273 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 00274 "movq "#dst1", "#dst2" \n\t"\ 00275 ".p2align 4 \n\t"\ 00276 "2: \n\t"\ 00277 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\ 00278 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\ 00279 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\ 00280 "add $16, %%"REG_d" \n\t"\ 00281 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00282 "pmulhw "#coeff", "#src1" \n\t"\ 00283 "pmulhw "#coeff", "#src2" \n\t"\ 00284 "paddw "#src1", "#dst1" \n\t"\ 00285 "paddw "#src2", "#dst2" \n\t"\ 00286 "test %%"REG_S", %%"REG_S" \n\t"\ 00287 " jnz 2b \n\t"\ 00288 00289 #define YSCALEYUV2PACKEDX \ 00290 YSCALEYUV2PACKEDX_UV \ 00291 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 00292 00293 #define YSCALEYUV2PACKEDX_END \ 00294 :: "r" (&c->redDither), \ 00295 "m" (dummy), "m" (dummy), "m" (dummy),\ 00296 
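/* The yuv2yuv1()/yuv2yuv1_ar() functions above handle the unscaled
 * (single-tap) case. Per output byte this is roughly (illustrative scalar
 * form, not code from this file):
 *
 *     dst[i] = av_clip_uint8(src[i] >> 7);           // yuv2yuv1: truncate
 *     dst[i] = av_clip_uint8((src[i] + 64) >> 7);    // yuv2yuv1_ar: round
 *
 * The pcmpeqw / psrlw $15 / psllw $6 sequence in the _ar variant simply
 * materializes the 0x0040 rounding bias in every 16-bit lane of %mm7 without
 * needing a memory constant. */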
"r" (dest), "m" (dstW_reg), "m"(uv_off) \ 00297 : "%"REG_a, "%"REG_d, "%"REG_S \ 00298 ); 00299 00300 #define YSCALEYUV2PACKEDX_ACCURATE_UV \ 00301 __asm__ volatile(\ 00302 "xor %%"REG_a", %%"REG_a" \n\t"\ 00303 ".p2align 4 \n\t"\ 00304 "nop \n\t"\ 00305 "1: \n\t"\ 00306 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 00307 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00308 "pxor %%mm4, %%mm4 \n\t"\ 00309 "pxor %%mm5, %%mm5 \n\t"\ 00310 "pxor %%mm6, %%mm6 \n\t"\ 00311 "pxor %%mm7, %%mm7 \n\t"\ 00312 ".p2align 4 \n\t"\ 00313 "2: \n\t"\ 00314 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ 00315 "add %6, %%"REG_S" \n\t" \ 00316 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ 00317 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 00318 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ 00319 "movq %%mm0, %%mm3 \n\t"\ 00320 "punpcklwd %%mm1, %%mm0 \n\t"\ 00321 "punpckhwd %%mm1, %%mm3 \n\t"\ 00322 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ 00323 "pmaddwd %%mm1, %%mm0 \n\t"\ 00324 "pmaddwd %%mm1, %%mm3 \n\t"\ 00325 "paddd %%mm0, %%mm4 \n\t"\ 00326 "paddd %%mm3, %%mm5 \n\t"\ 00327 "add %6, %%"REG_S" \n\t" \ 00328 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ 00329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 00330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 00331 "test %%"REG_S", %%"REG_S" \n\t"\ 00332 "movq %%mm2, %%mm0 \n\t"\ 00333 "punpcklwd %%mm3, %%mm2 \n\t"\ 00334 "punpckhwd %%mm3, %%mm0 \n\t"\ 00335 "pmaddwd %%mm1, %%mm2 \n\t"\ 00336 "pmaddwd %%mm1, %%mm0 \n\t"\ 00337 "paddd %%mm2, %%mm6 \n\t"\ 00338 "paddd %%mm0, %%mm7 \n\t"\ 00339 " jnz 2b \n\t"\ 00340 "psrad $16, %%mm4 \n\t"\ 00341 "psrad $16, %%mm5 \n\t"\ 00342 "psrad $16, %%mm6 \n\t"\ 00343 "psrad $16, %%mm7 \n\t"\ 00344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 00345 "packssdw %%mm5, %%mm4 \n\t"\ 00346 "packssdw %%mm7, %%mm6 \n\t"\ 00347 "paddw %%mm0, %%mm4 \n\t"\ 00348 "paddw %%mm0, %%mm6 \n\t"\ 00349 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 00350 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 00351 00352 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 00353 "lea "offset"(%0), %%"REG_d" \n\t"\ 00354 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00355 "pxor %%mm1, %%mm1 \n\t"\ 00356 "pxor %%mm5, %%mm5 \n\t"\ 00357 "pxor %%mm7, %%mm7 \n\t"\ 00358 "pxor %%mm6, %%mm6 \n\t"\ 00359 ".p2align 4 \n\t"\ 00360 "2: \n\t"\ 00361 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 00362 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 00363 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 00364 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 00365 "movq %%mm0, %%mm3 \n\t"\ 00366 "punpcklwd %%mm4, %%mm0 \n\t"\ 00367 "punpckhwd %%mm4, %%mm3 \n\t"\ 00368 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ 00369 "pmaddwd %%mm4, %%mm0 \n\t"\ 00370 "pmaddwd %%mm4, %%mm3 \n\t"\ 00371 "paddd %%mm0, %%mm1 \n\t"\ 00372 "paddd %%mm3, %%mm5 \n\t"\ 00373 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ 00374 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 00375 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 00376 "test %%"REG_S", %%"REG_S" \n\t"\ 00377 "movq %%mm2, %%mm0 \n\t"\ 00378 "punpcklwd %%mm3, %%mm2 \n\t"\ 00379 "punpckhwd %%mm3, %%mm0 \n\t"\ 00380 "pmaddwd %%mm4, %%mm2 \n\t"\ 00381 "pmaddwd %%mm4, %%mm0 \n\t"\ 00382 "paddd %%mm2, %%mm7 \n\t"\ 00383 "paddd %%mm0, %%mm6 \n\t"\ 00384 " jnz 2b \n\t"\ 00385 "psrad $16, %%mm1 \n\t"\ 00386 "psrad $16, %%mm5 \n\t"\ 00387 "psrad $16, %%mm7 \n\t"\ 00388 "psrad $16, %%mm6 \n\t"\ 00389 "movq "VROUNDER_OFFSET"(%0), %%mm0 
\n\t"\ 00390 "packssdw %%mm5, %%mm1 \n\t"\ 00391 "packssdw %%mm6, %%mm7 \n\t"\ 00392 "paddw %%mm0, %%mm1 \n\t"\ 00393 "paddw %%mm0, %%mm7 \n\t"\ 00394 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 00395 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 00396 00397 #define YSCALEYUV2PACKEDX_ACCURATE \ 00398 YSCALEYUV2PACKEDX_ACCURATE_UV \ 00399 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 00400 00401 #define YSCALEYUV2RGBX \ 00402 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 00403 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 00404 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 00405 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 00406 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 00407 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 00408 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 00409 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 00410 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 00411 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 00412 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 00413 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 00414 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 00415 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 00416 "paddw %%mm3, %%mm4 \n\t"\ 00417 "movq %%mm2, %%mm0 \n\t"\ 00418 "movq %%mm5, %%mm6 \n\t"\ 00419 "movq %%mm4, %%mm3 \n\t"\ 00420 "punpcklwd %%mm2, %%mm2 \n\t"\ 00421 "punpcklwd %%mm5, %%mm5 \n\t"\ 00422 "punpcklwd %%mm4, %%mm4 \n\t"\ 00423 "paddw %%mm1, %%mm2 \n\t"\ 00424 "paddw %%mm1, %%mm5 \n\t"\ 00425 "paddw %%mm1, %%mm4 \n\t"\ 00426 "punpckhwd %%mm0, %%mm0 \n\t"\ 00427 "punpckhwd %%mm6, %%mm6 \n\t"\ 00428 "punpckhwd %%mm3, %%mm3 \n\t"\ 00429 "paddw %%mm7, %%mm0 \n\t"\ 00430 "paddw %%mm7, %%mm6 \n\t"\ 00431 "paddw %%mm7, %%mm3 \n\t"\ 00432 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 00433 "packuswb %%mm0, %%mm2 \n\t"\ 00434 "packuswb %%mm6, %%mm5 \n\t"\ 00435 "packuswb %%mm3, %%mm4 \n\t"\ 00436 00437 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 00438 "movq "#b", "#q2" \n\t" /* B */\ 00439 "movq "#r", "#t" \n\t" /* R */\ 00440 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 00441 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 00442 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ 00443 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 00444 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 00445 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 00446 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 00447 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 00448 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ 00449 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 00450 \ 00451 MOVNTQ( q0, (dst, index, 4))\ 00452 MOVNTQ( b, 8(dst, index, 4))\ 00453 MOVNTQ( q2, 16(dst, index, 4))\ 00454 MOVNTQ( q3, 24(dst, index, 4))\ 00455 \ 00456 "add $8, "#index" \n\t"\ 00457 "cmp "#dstw", "#index" \n\t"\ 00458 " jb 1b \n\t" 00459 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 00460 00461 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, 00462 const int16_t **lumSrc, int lumFilterSize, 00463 const int16_t *chrFilter, const int16_t **chrUSrc, 00464 const int16_t **chrVSrc, 00465 int chrFilterSize, const int16_t **alpSrc, 00466 uint8_t *dest, int dstW, int dstY) 00467 { 00468 x86_reg dummy=0; 00469 x86_reg dstW_reg = dstW; 00470 x86_reg uv_off = c->uv_off << 1; 00471 00472 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 00473 YSCALEYUV2PACKEDX_ACCURATE 00474 YSCALEYUV2RGBX 00475 "movq %%mm2, "U_TEMP"(%0) \n\t" 00476 "movq %%mm4, "V_TEMP"(%0) \n\t" 00477 "movq %%mm5, "Y_TEMP"(%0) \n\t" 00478 
YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) 00479 "movq "Y_TEMP"(%0), %%mm5 \n\t" 00480 "psraw $3, %%mm1 \n\t" 00481 "psraw $3, %%mm7 \n\t" 00482 "packuswb %%mm7, %%mm1 \n\t" 00483 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) 00484 YSCALEYUV2PACKEDX_END 00485 } else { 00486 YSCALEYUV2PACKEDX_ACCURATE 00487 YSCALEYUV2RGBX 00488 "pcmpeqd %%mm7, %%mm7 \n\t" 00489 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 00490 YSCALEYUV2PACKEDX_END 00491 } 00492 } 00493 00494 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, 00495 const int16_t **lumSrc, int lumFilterSize, 00496 const int16_t *chrFilter, const int16_t **chrUSrc, 00497 const int16_t **chrVSrc, 00498 int chrFilterSize, const int16_t **alpSrc, 00499 uint8_t *dest, int dstW, int dstY) 00500 { 00501 x86_reg dummy=0; 00502 x86_reg dstW_reg = dstW; 00503 x86_reg uv_off = c->uv_off << 1; 00504 00505 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 00506 YSCALEYUV2PACKEDX 00507 YSCALEYUV2RGBX 00508 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 00509 "psraw $3, %%mm1 \n\t" 00510 "psraw $3, %%mm7 \n\t" 00511 "packuswb %%mm7, %%mm1 \n\t" 00512 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 00513 YSCALEYUV2PACKEDX_END 00514 } else { 00515 YSCALEYUV2PACKEDX 00516 YSCALEYUV2RGBX 00517 "pcmpeqd %%mm7, %%mm7 \n\t" 00518 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 00519 YSCALEYUV2PACKEDX_END 00520 } 00521 } 00522 00523 #define REAL_WRITERGB16(dst, dstw, index) \ 00524 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 00525 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ 00526 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 00527 "psrlq $3, %%mm2 \n\t"\ 00528 \ 00529 "movq %%mm2, %%mm1 \n\t"\ 00530 "movq %%mm4, %%mm3 \n\t"\ 00531 \ 00532 "punpcklbw %%mm7, %%mm3 \n\t"\ 00533 "punpcklbw %%mm5, %%mm2 \n\t"\ 00534 "punpckhbw %%mm7, %%mm4 \n\t"\ 00535 "punpckhbw %%mm5, %%mm1 \n\t"\ 00536 \ 00537 "psllq $3, %%mm3 \n\t"\ 00538 "psllq $3, %%mm4 \n\t"\ 00539 \ 00540 "por %%mm3, %%mm2 \n\t"\ 00541 "por %%mm4, %%mm1 \n\t"\ 00542 \ 00543 MOVNTQ(%%mm2, (dst, index, 2))\ 00544 MOVNTQ(%%mm1, 8(dst, index, 2))\ 00545 \ 00546 "add $8, "#index" \n\t"\ 00547 "cmp "#dstw", "#index" \n\t"\ 00548 " jb 1b \n\t" 00549 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 00550 00551 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, 00552 const int16_t **lumSrc, int lumFilterSize, 00553 const int16_t *chrFilter, const int16_t **chrUSrc, 00554 const int16_t **chrVSrc, 00555 int chrFilterSize, const int16_t **alpSrc, 00556 uint8_t *dest, int dstW, int dstY) 00557 { 00558 x86_reg dummy=0; 00559 x86_reg dstW_reg = dstW; 00560 x86_reg uv_off = c->uv_off << 1; 00561 00562 YSCALEYUV2PACKEDX_ACCURATE 00563 YSCALEYUV2RGBX 00564 "pxor %%mm7, %%mm7 \n\t" 00565 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 00566 #ifdef DITHER1XBPP 00567 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 00568 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 00569 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 00570 #endif 00571 WRITERGB16(%4, %5, %%REGa) 00572 YSCALEYUV2PACKEDX_END 00573 } 00574 00575 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, 00576 const int16_t **lumSrc, int lumFilterSize, 00577 const int16_t *chrFilter, const int16_t **chrUSrc, 00578 const int16_t **chrVSrc, 00579 int chrFilterSize, const int16_t **alpSrc, 00580 uint8_t *dest, int dstW, int dstY) 00581 { 00582 
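    /* WRITERGB16 above (and WRITERGB15 further down) reduce the byte-packed
     * B/G/R vectors to 16-bit pixels. Per pixel this is roughly, in
     * illustrative scalar form:
     *
     *     uint16_t rgb565 = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
     *     uint16_t rgb555 = ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3);
     *
     * with the DITHER1XBPP blocks optionally adding a small per-position
     * ordered-dither bias (saturating paddusb) to B/G/R beforehand. */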
x86_reg dummy=0; 00583 x86_reg dstW_reg = dstW; 00584 x86_reg uv_off = c->uv_off << 1; 00585 00586 YSCALEYUV2PACKEDX 00587 YSCALEYUV2RGBX 00588 "pxor %%mm7, %%mm7 \n\t" 00589 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 00590 #ifdef DITHER1XBPP 00591 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 00592 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 00593 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 00594 #endif 00595 WRITERGB16(%4, %5, %%REGa) 00596 YSCALEYUV2PACKEDX_END 00597 } 00598 00599 #define REAL_WRITERGB15(dst, dstw, index) \ 00600 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 00601 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 00602 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 00603 "psrlq $3, %%mm2 \n\t"\ 00604 "psrlq $1, %%mm5 \n\t"\ 00605 \ 00606 "movq %%mm2, %%mm1 \n\t"\ 00607 "movq %%mm4, %%mm3 \n\t"\ 00608 \ 00609 "punpcklbw %%mm7, %%mm3 \n\t"\ 00610 "punpcklbw %%mm5, %%mm2 \n\t"\ 00611 "punpckhbw %%mm7, %%mm4 \n\t"\ 00612 "punpckhbw %%mm5, %%mm1 \n\t"\ 00613 \ 00614 "psllq $2, %%mm3 \n\t"\ 00615 "psllq $2, %%mm4 \n\t"\ 00616 \ 00617 "por %%mm3, %%mm2 \n\t"\ 00618 "por %%mm4, %%mm1 \n\t"\ 00619 \ 00620 MOVNTQ(%%mm2, (dst, index, 2))\ 00621 MOVNTQ(%%mm1, 8(dst, index, 2))\ 00622 \ 00623 "add $8, "#index" \n\t"\ 00624 "cmp "#dstw", "#index" \n\t"\ 00625 " jb 1b \n\t" 00626 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 00627 00628 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, 00629 const int16_t **lumSrc, int lumFilterSize, 00630 const int16_t *chrFilter, const int16_t **chrUSrc, 00631 const int16_t **chrVSrc, 00632 int chrFilterSize, const int16_t **alpSrc, 00633 uint8_t *dest, int dstW, int dstY) 00634 { 00635 x86_reg dummy=0; 00636 x86_reg dstW_reg = dstW; 00637 x86_reg uv_off = c->uv_off << 1; 00638 00639 YSCALEYUV2PACKEDX_ACCURATE 00640 YSCALEYUV2RGBX 00641 "pxor %%mm7, %%mm7 \n\t" 00642 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 00643 #ifdef DITHER1XBPP 00644 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 00645 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 00646 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 00647 #endif 00648 WRITERGB15(%4, %5, %%REGa) 00649 YSCALEYUV2PACKEDX_END 00650 } 00651 00652 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, 00653 const int16_t **lumSrc, int lumFilterSize, 00654 const int16_t *chrFilter, const int16_t **chrUSrc, 00655 const int16_t **chrVSrc, 00656 int chrFilterSize, const int16_t **alpSrc, 00657 uint8_t *dest, int dstW, int dstY) 00658 { 00659 x86_reg dummy=0; 00660 x86_reg dstW_reg = dstW; 00661 x86_reg uv_off = c->uv_off << 1; 00662 00663 YSCALEYUV2PACKEDX 00664 YSCALEYUV2RGBX 00665 "pxor %%mm7, %%mm7 \n\t" 00666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 00667 #ifdef DITHER1XBPP 00668 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 00669 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 00670 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 00671 #endif 00672 WRITERGB15(%4, %5, %%REGa) 00673 YSCALEYUV2PACKEDX_END 00674 } 00675 00676 #define WRITEBGR24MMX(dst, dstw, index) \ 00677 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 00678 "movq %%mm2, %%mm1 \n\t" /* B */\ 00679 "movq %%mm5, %%mm6 \n\t" /* R */\ 00680 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 00681 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 00682 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 00683 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 00684 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 00685 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 00686 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 00687 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 00688 "punpcklwd %%mm6, %%mm1 
\n\t" /* 0RGB0RGB 2 */\ 00689 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 00690 \ 00691 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 00692 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 00693 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 00694 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 00695 \ 00696 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 00697 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 00698 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 00699 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ 00700 \ 00701 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 00702 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 00703 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 00704 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 00705 \ 00706 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 00707 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 00708 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 00709 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 00710 MOVNTQ(%%mm0, (dst))\ 00711 \ 00712 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 00713 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ 00714 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 00715 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 00716 MOVNTQ(%%mm6, 8(dst))\ 00717 \ 00718 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 00719 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 00720 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 00721 MOVNTQ(%%mm5, 16(dst))\ 00722 \ 00723 "add $24, "#dst" \n\t"\ 00724 \ 00725 "add $8, "#index" \n\t"\ 00726 "cmp "#dstw", "#index" \n\t"\ 00727 " jb 1b \n\t" 00728 00729 #define WRITEBGR24MMX2(dst, dstw, index) \ 00730 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 00731 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 00732 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 00733 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ 00734 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ 00735 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ 00736 \ 00737 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ 00738 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ 00739 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ 00740 \ 00741 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ 00742 "por %%mm1, %%mm6 \n\t"\ 00743 "por %%mm3, %%mm6 \n\t"\ 00744 MOVNTQ(%%mm6, (dst))\ 00745 \ 00746 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ 00747 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ 00748 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ 00749 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ 00750 \ 00751 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ 00752 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ 00753 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ 00754 \ 00755 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ 00756 "por %%mm3, %%mm6 \n\t"\ 00757 MOVNTQ(%%mm6, 8(dst))\ 00758 \ 00759 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ 00760 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ 00761 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ 00762 \ 00763 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ 00764 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ 00765 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ 00766 \ 00767 "por %%mm1, %%mm3 \n\t"\ 00768 "por %%mm3, %%mm6 \n\t"\ 00769 MOVNTQ(%%mm6, 16(dst))\ 00770 \ 00771 "add $24, "#dst" \n\t"\ 00772 \ 00773 "add $8, "#index" \n\t"\ 00774 "cmp "#dstw", "#index" \n\t"\ 00775 " jb 1b \n\t" 00776 00777 #if COMPILE_TEMPLATE_MMX2 00778 #undef WRITEBGR24 00779 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) 00780 #else 00781 #undef WRITEBGR24 00782 #define WRITEBGR24(dst, dstw, 
index) WRITEBGR24MMX(dst, dstw, index) 00783 #endif 00784 00785 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, 00786 const int16_t **lumSrc, int lumFilterSize, 00787 const int16_t *chrFilter, const int16_t **chrUSrc, 00788 const int16_t **chrVSrc, 00789 int chrFilterSize, const int16_t **alpSrc, 00790 uint8_t *dest, int dstW, int dstY) 00791 { 00792 x86_reg dummy=0; 00793 x86_reg dstW_reg = dstW; 00794 x86_reg uv_off = c->uv_off << 1; 00795 00796 YSCALEYUV2PACKEDX_ACCURATE 00797 YSCALEYUV2RGBX 00798 "pxor %%mm7, %%mm7 \n\t" 00799 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize 00800 "add %4, %%"REG_c" \n\t" 00801 WRITEBGR24(%%REGc, %5, %%REGa) 00802 :: "r" (&c->redDither), 00803 "m" (dummy), "m" (dummy), "m" (dummy), 00804 "r" (dest), "m" (dstW_reg), "m"(uv_off) 00805 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 00806 ); 00807 } 00808 00809 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, 00810 const int16_t **lumSrc, int lumFilterSize, 00811 const int16_t *chrFilter, const int16_t **chrUSrc, 00812 const int16_t **chrVSrc, 00813 int chrFilterSize, const int16_t **alpSrc, 00814 uint8_t *dest, int dstW, int dstY) 00815 { 00816 x86_reg dummy=0; 00817 x86_reg dstW_reg = dstW; 00818 x86_reg uv_off = c->uv_off << 1; 00819 00820 YSCALEYUV2PACKEDX 00821 YSCALEYUV2RGBX 00822 "pxor %%mm7, %%mm7 \n\t" 00823 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize 00824 "add %4, %%"REG_c" \n\t" 00825 WRITEBGR24(%%REGc, %5, %%REGa) 00826 :: "r" (&c->redDither), 00827 "m" (dummy), "m" (dummy), "m" (dummy), 00828 "r" (dest), "m" (dstW_reg), "m"(uv_off) 00829 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 00830 ); 00831 } 00832 00833 #define REAL_WRITEYUY2(dst, dstw, index) \ 00834 "packuswb %%mm3, %%mm3 \n\t"\ 00835 "packuswb %%mm4, %%mm4 \n\t"\ 00836 "packuswb %%mm7, %%mm1 \n\t"\ 00837 "punpcklbw %%mm4, %%mm3 \n\t"\ 00838 "movq %%mm1, %%mm7 \n\t"\ 00839 "punpcklbw %%mm3, %%mm1 \n\t"\ 00840 "punpckhbw %%mm3, %%mm7 \n\t"\ 00841 \ 00842 MOVNTQ(%%mm1, (dst, index, 2))\ 00843 MOVNTQ(%%mm7, 8(dst, index, 2))\ 00844 \ 00845 "add $8, "#index" \n\t"\ 00846 "cmp "#dstw", "#index" \n\t"\ 00847 " jb 1b \n\t" 00848 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 00849 00850 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, 00851 const int16_t **lumSrc, int lumFilterSize, 00852 const int16_t *chrFilter, const int16_t **chrUSrc, 00853 const int16_t **chrVSrc, 00854 int chrFilterSize, const int16_t **alpSrc, 00855 uint8_t *dest, int dstW, int dstY) 00856 { 00857 x86_reg dummy=0; 00858 x86_reg dstW_reg = dstW; 00859 x86_reg uv_off = c->uv_off << 1; 00860 00861 YSCALEYUV2PACKEDX_ACCURATE 00862 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 00863 "psraw $3, %%mm3 \n\t" 00864 "psraw $3, %%mm4 \n\t" 00865 "psraw $3, %%mm1 \n\t" 00866 "psraw $3, %%mm7 \n\t" 00867 WRITEYUY2(%4, %5, %%REGa) 00868 YSCALEYUV2PACKEDX_END 00869 } 00870 00871 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, 00872 const int16_t **lumSrc, int lumFilterSize, 00873 const int16_t *chrFilter, const int16_t **chrUSrc, 00874 const int16_t **chrVSrc, 00875 int chrFilterSize, const int16_t **alpSrc, 00876 uint8_t *dest, int dstW, int dstY) 00877 { 00878 x86_reg dummy=0; 00879 x86_reg dstW_reg = dstW; 00880 x86_reg uv_off = c->uv_off << 1; 00881 00882 YSCALEYUV2PACKEDX 00883 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 00884 "psraw $3, %%mm3 \n\t" 00885 "psraw $3, %%mm4 \n\t" 00886 "psraw $3, %%mm1 \n\t" 00887 "psraw $3, %%mm7 \n\t" 
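    /* WRITEYUY2, invoked next, interleaves the packed results as Y0 U Y1 V
     * byte pairs. For every pair of output pixels, in illustrative scalar
     * form:
     *
     *     dst[4*i + 0] = Y[2*i];
     *     dst[4*i + 1] = U[i];
     *     dst[4*i + 2] = Y[2*i + 1];
     *     dst[4*i + 3] = V[i];
     *
     * The psraw $3 shifts just above only rescale the 16-bit accumulators
     * into 8-bit range, since no RGB matrix is applied on this path. */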
00888 WRITEYUY2(%4, %5, %%REGa) 00889 YSCALEYUV2PACKEDX_END 00890 } 00891 00892 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 00893 "xor "#index", "#index" \n\t"\ 00894 ".p2align 4 \n\t"\ 00895 "1: \n\t"\ 00896 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 00897 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 00898 "add "UV_OFFx2"("#c"), "#index" \n\t" \ 00899 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 00900 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 00901 "sub "UV_OFFx2"("#c"), "#index" \n\t" \ 00902 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 00903 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 00904 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 00905 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 00906 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 00907 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 00908 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 00909 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 00910 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 00911 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 00912 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 00913 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 00914 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 00915 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 00916 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 00917 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 00918 00919 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 00920 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 00921 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 00922 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 00923 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 00924 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 00925 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 00926 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 00927 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 00928 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00929 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00930 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 00931 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 00932 00933 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 00934 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 00935 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 00936 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 00937 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 00938 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 00939 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 00940 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 00941 "paddw %%mm3, %%mm4 \n\t"\ 00942 "movq %%mm2, %%mm0 \n\t"\ 00943 "movq %%mm5, %%mm6 \n\t"\ 00944 "movq %%mm4, %%mm3 \n\t"\ 00945 "punpcklwd %%mm2, %%mm2 \n\t"\ 00946 "punpcklwd %%mm5, %%mm5 \n\t"\ 00947 "punpcklwd %%mm4, %%mm4 \n\t"\ 00948 "paddw %%mm1, %%mm2 \n\t"\ 00949 "paddw %%mm1, %%mm5 \n\t"\ 00950 "paddw %%mm1, %%mm4 \n\t"\ 00951 "punpckhwd %%mm0, %%mm0 \n\t"\ 00952 "punpckhwd %%mm6, %%mm6 \n\t"\ 00953 "punpckhwd %%mm3, %%mm3 \n\t"\ 00954 "paddw %%mm7, %%mm0 \n\t"\ 00955 "paddw %%mm7, %%mm6 \n\t"\ 00956 "paddw %%mm7, %%mm3 \n\t"\ 00957 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 00958 "packuswb %%mm0, %%mm2 \n\t"\ 00959 "packuswb 
%%mm6, %%mm5 \n\t"\ 00960 "packuswb %%mm3, %%mm4 \n\t"\ 00961 00962 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 00963 00964 #define YSCALEYUV2RGB(index, c) \ 00965 REAL_YSCALEYUV2RGB_UV(index, c) \ 00966 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 00967 REAL_YSCALEYUV2RGB_COEFF(c) 00968 00972 static void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, 00973 const uint16_t *buf1, const uint16_t *ubuf0, 00974 const uint16_t *ubuf1, const uint16_t *vbuf0, 00975 const uint16_t *vbuf1, const uint16_t *abuf0, 00976 const uint16_t *abuf1, uint8_t *dest, 00977 int dstW, int yalpha, int uvalpha, int y) 00978 { 00979 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 00980 #if ARCH_X86_64 00981 __asm__ volatile( 00982 YSCALEYUV2RGB(%%r8, %5) 00983 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) 00984 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 00985 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 00986 "packuswb %%mm7, %%mm1 \n\t" 00987 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 00988 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), 00989 "a" (&c->redDither), 00990 "r" (abuf0), "r" (abuf1) 00991 : "%r8" 00992 ); 00993 #else 00994 *(const uint16_t **)(&c->u_temp)=abuf0; 00995 *(const uint16_t **)(&c->v_temp)=abuf1; 00996 __asm__ volatile( 00997 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 00998 "mov %4, %%"REG_b" \n\t" 00999 "push %%"REG_BP" \n\t" 01000 YSCALEYUV2RGB(%%REGBP, %5) 01001 "push %0 \n\t" 01002 "push %1 \n\t" 01003 "mov "U_TEMP"(%5), %0 \n\t" 01004 "mov "V_TEMP"(%5), %1 \n\t" 01005 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) 01006 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 01007 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 01008 "packuswb %%mm7, %%mm1 \n\t" 01009 "pop %1 \n\t" 01010 "pop %0 \n\t" 01011 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 01012 "pop %%"REG_BP" \n\t" 01013 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01014 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01015 "a" (&c->redDither) 01016 ); 01017 #endif 01018 } else { 01019 __asm__ volatile( 01020 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01021 "mov %4, %%"REG_b" \n\t" 01022 "push %%"REG_BP" \n\t" 01023 YSCALEYUV2RGB(%%REGBP, %5) 01024 "pcmpeqd %%mm7, %%mm7 \n\t" 01025 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01026 "pop %%"REG_BP" \n\t" 01027 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01028 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01029 "a" (&c->redDither) 01030 ); 01031 } 01032 } 01033 01034 static void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0, 01035 const uint16_t *buf1, const uint16_t *ubuf0, 01036 const uint16_t *ubuf1, const uint16_t *vbuf0, 01037 const uint16_t *vbuf1, const uint16_t *abuf0, 01038 const uint16_t *abuf1, uint8_t *dest, 01039 int dstW, int yalpha, int uvalpha, int y) 01040 { 01041 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 01042 __asm__ volatile( 01043 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01044 "mov %4, %%"REG_b" \n\t" 01045 "push %%"REG_BP" \n\t" 01046 YSCALEYUV2RGB(%%REGBP, %5) 01047 "pxor %%mm7, %%mm7 \n\t" 01048 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 01049 "pop %%"REG_BP" \n\t" 01050 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01051 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01052 "a" (&c->redDither) 01053 ); 01054 } 01055 01056 static void RENAME(yuv2rgb555_2)(SwsContext *c, const 
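/* The *_2 output functions in this block blend two adjacent source lines with
 * a single weight before conversion. Per sample this is roughly (illustrative
 * scalar form; yalpha1/uvalpha1 are the weights stored at
 * LUM_MMX_FILTER_OFFSET+8 / CHR_MMX_FILTER_OFFSET+8 in the context):
 *
 *     Y = (buf1[i]   >> 4) + (((buf0[i]   - buf1[i])   * yalpha1)  >> 16);
 *     U = (uvbuf1[i] >> 4) + (((uvbuf0[i] - uvbuf1[i]) * uvalpha1) >> 16);
 *
 * i.e. a linear interpolation between the two lines, with the psraw $4
 * prescale keeping the 16-bit intermediates in range for the RGB matrix. */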
uint16_t *buf0, 01057 const uint16_t *buf1, const uint16_t *ubuf0, 01058 const uint16_t *ubuf1, const uint16_t *vbuf0, 01059 const uint16_t *vbuf1, const uint16_t *abuf0, 01060 const uint16_t *abuf1, uint8_t *dest, 01061 int dstW, int yalpha, int uvalpha, int y) 01062 { 01063 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 01064 __asm__ volatile( 01065 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01066 "mov %4, %%"REG_b" \n\t" 01067 "push %%"REG_BP" \n\t" 01068 YSCALEYUV2RGB(%%REGBP, %5) 01069 "pxor %%mm7, %%mm7 \n\t" 01070 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01071 #ifdef DITHER1XBPP 01072 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01073 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01074 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01075 #endif 01076 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 01077 "pop %%"REG_BP" \n\t" 01078 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01079 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01080 "a" (&c->redDither) 01081 ); 01082 } 01083 01084 static void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, 01085 const uint16_t *buf1, const uint16_t *ubuf0, 01086 const uint16_t *ubuf1, const uint16_t *vbuf0, 01087 const uint16_t *vbuf1, const uint16_t *abuf0, 01088 const uint16_t *abuf1, uint8_t *dest, 01089 int dstW, int yalpha, int uvalpha, int y) 01090 { 01091 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 01092 __asm__ volatile( 01093 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01094 "mov %4, %%"REG_b" \n\t" 01095 "push %%"REG_BP" \n\t" 01096 YSCALEYUV2RGB(%%REGBP, %5) 01097 "pxor %%mm7, %%mm7 \n\t" 01098 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01099 #ifdef DITHER1XBPP 01100 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01101 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01102 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01103 #endif 01104 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 01105 "pop %%"REG_BP" \n\t" 01106 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01107 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01108 "a" (&c->redDither) 01109 ); 01110 } 01111 01112 #define REAL_YSCALEYUV2PACKED(index, c) \ 01113 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 01114 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 01115 "psraw $3, %%mm0 \n\t"\ 01116 "psraw $3, %%mm1 \n\t"\ 01117 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 01118 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 01119 "xor "#index", "#index" \n\t"\ 01120 ".p2align 4 \n\t"\ 01121 "1: \n\t"\ 01122 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 01123 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 01124 "add "UV_OFFx2"("#c"), "#index" \n\t" \ 01125 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 01126 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 01127 "sub "UV_OFFx2"("#c"), "#index" \n\t" \ 01128 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 01129 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 01130 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 01131 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 01132 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 01133 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 01134 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 01135 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 01136 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 01137 "movq (%0, "#index", 2), %%mm0 \n\t" 
/*buf0[eax]*/\ 01138 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 01139 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 01140 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 01141 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 01142 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 01143 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 01144 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 01145 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 01146 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 01147 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 01148 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 01149 01150 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 01151 01152 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, 01153 const uint16_t *buf1, const uint16_t *ubuf0, 01154 const uint16_t *ubuf1, const uint16_t *vbuf0, 01155 const uint16_t *vbuf1, const uint16_t *abuf0, 01156 const uint16_t *abuf1, uint8_t *dest, 01157 int dstW, int yalpha, int uvalpha, int y) 01158 { 01159 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 01160 __asm__ volatile( 01161 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01162 "mov %4, %%"REG_b" \n\t" 01163 "push %%"REG_BP" \n\t" 01164 YSCALEYUV2PACKED(%%REGBP, %5) 01165 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 01166 "pop %%"REG_BP" \n\t" 01167 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01168 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01169 "a" (&c->redDither) 01170 ); 01171 } 01172 01173 #define REAL_YSCALEYUV2RGB1(index, c) \ 01174 "xor "#index", "#index" \n\t"\ 01175 ".p2align 4 \n\t"\ 01176 "1: \n\t"\ 01177 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 01178 "add "UV_OFFx2"("#c"), "#index" \n\t" \ 01179 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 01180 "sub "UV_OFFx2"("#c"), "#index" \n\t" \ 01181 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 01182 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 01183 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 01184 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 01185 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 01186 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 01187 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 01188 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 01189 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 01190 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 01191 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 01192 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 01193 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 01194 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 01195 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 01196 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 01197 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 01198 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 01199 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 01200 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 01201 "paddw %%mm3, %%mm4 \n\t"\ 01202 "movq %%mm2, %%mm0 \n\t"\ 01203 "movq %%mm5, %%mm6 \n\t"\ 01204 "movq %%mm4, %%mm3 \n\t"\ 01205 "punpcklwd %%mm2, %%mm2 \n\t"\ 01206 "punpcklwd %%mm5, %%mm5 \n\t"\ 01207 "punpcklwd %%mm4, %%mm4 \n\t"\ 01208 "paddw %%mm1, %%mm2 \n\t"\ 01209 "paddw %%mm1, %%mm5 \n\t"\ 01210 "paddw %%mm1, %%mm4 \n\t"\ 01211 "punpckhwd %%mm0, %%mm0 \n\t"\ 01212 "punpckhwd %%mm6, %%mm6 \n\t"\ 01213 "punpckhwd 
%%mm3, %%mm3 \n\t"\ 01214 "paddw %%mm7, %%mm0 \n\t"\ 01215 "paddw %%mm7, %%mm6 \n\t"\ 01216 "paddw %%mm7, %%mm3 \n\t"\ 01217 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 01218 "packuswb %%mm0, %%mm2 \n\t"\ 01219 "packuswb %%mm6, %%mm5 \n\t"\ 01220 "packuswb %%mm3, %%mm4 \n\t"\ 01221 01222 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 01223 01224 // do vertical chrominance interpolation 01225 #define REAL_YSCALEYUV2RGB1b(index, c) \ 01226 "xor "#index", "#index" \n\t"\ 01227 ".p2align 4 \n\t"\ 01228 "1: \n\t"\ 01229 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 01230 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 01231 "add "UV_OFFx2"("#c"), "#index" \n\t" \ 01232 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 01233 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 01234 "sub "UV_OFFx2"("#c"), "#index" \n\t" \ 01235 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 01236 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 01237 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ 01238 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ 01239 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 01240 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 01241 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 01242 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 01243 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 01244 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 01245 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 01246 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 01247 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 01248 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 01249 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 01250 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 01251 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 01252 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 01253 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 01254 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 01255 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 01256 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 01257 "paddw %%mm3, %%mm4 \n\t"\ 01258 "movq %%mm2, %%mm0 \n\t"\ 01259 "movq %%mm5, %%mm6 \n\t"\ 01260 "movq %%mm4, %%mm3 \n\t"\ 01261 "punpcklwd %%mm2, %%mm2 \n\t"\ 01262 "punpcklwd %%mm5, %%mm5 \n\t"\ 01263 "punpcklwd %%mm4, %%mm4 \n\t"\ 01264 "paddw %%mm1, %%mm2 \n\t"\ 01265 "paddw %%mm1, %%mm5 \n\t"\ 01266 "paddw %%mm1, %%mm4 \n\t"\ 01267 "punpckhwd %%mm0, %%mm0 \n\t"\ 01268 "punpckhwd %%mm6, %%mm6 \n\t"\ 01269 "punpckhwd %%mm3, %%mm3 \n\t"\ 01270 "paddw %%mm7, %%mm0 \n\t"\ 01271 "paddw %%mm7, %%mm6 \n\t"\ 01272 "paddw %%mm7, %%mm3 \n\t"\ 01273 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 01274 "packuswb %%mm0, %%mm2 \n\t"\ 01275 "packuswb %%mm6, %%mm5 \n\t"\ 01276 "packuswb %%mm3, %%mm4 \n\t"\ 01277 01278 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 01279 01280 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 01281 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ 01282 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ 01283 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ 01284 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ 01285 "packuswb %%mm1, %%mm7 \n\t" 01286 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 01287 01291 static void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, 01292 const uint16_t *ubuf0, const uint16_t *ubuf1, 01293 const uint16_t *vbuf0, const uint16_t *vbuf1, 01294 const uint16_t *abuf0, uint8_t *dest, 01295 int dstW, int uvalpha, enum PixelFormat dstFormat, 
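/* The *_1 output functions in this block use a single luma line. For chroma
 * they either read uvbuf0 directly (the uvalpha < 2048 fast path, which is
 * what the "shifts chrominance by 0.5 pixels" comments refer to) or average
 * the two chroma lines (the "1b" macro variants). Roughly, per sample:
 *
 *     Y = buf0[i] >> 4;
 *     U = uvbuf0[i] >> 4;                  // YSCALEYUV2RGB1
 *     U = (uvbuf0[i] + uvbuf1[i]) >> 5;    // YSCALEYUV2RGB1b
 */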
01296 int flags, int y) 01297 { 01298 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 01299 01300 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 01301 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01302 __asm__ volatile( 01303 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01304 "mov %4, %%"REG_b" \n\t" 01305 "push %%"REG_BP" \n\t" 01306 YSCALEYUV2RGB1(%%REGBP, %5) 01307 YSCALEYUV2RGB1_ALPHA(%%REGBP) 01308 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01309 "pop %%"REG_BP" \n\t" 01310 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01311 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01312 "a" (&c->redDither) 01313 ); 01314 } else { 01315 __asm__ volatile( 01316 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01317 "mov %4, %%"REG_b" \n\t" 01318 "push %%"REG_BP" \n\t" 01319 YSCALEYUV2RGB1(%%REGBP, %5) 01320 "pcmpeqd %%mm7, %%mm7 \n\t" 01321 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01322 "pop %%"REG_BP" \n\t" 01323 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01324 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01325 "a" (&c->redDither) 01326 ); 01327 } 01328 } else { 01329 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01330 __asm__ volatile( 01331 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01332 "mov %4, %%"REG_b" \n\t" 01333 "push %%"REG_BP" \n\t" 01334 YSCALEYUV2RGB1b(%%REGBP, %5) 01335 YSCALEYUV2RGB1_ALPHA(%%REGBP) 01336 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01337 "pop %%"REG_BP" \n\t" 01338 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01339 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01340 "a" (&c->redDither) 01341 ); 01342 } else { 01343 __asm__ volatile( 01344 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01345 "mov %4, %%"REG_b" \n\t" 01346 "push %%"REG_BP" \n\t" 01347 YSCALEYUV2RGB1b(%%REGBP, %5) 01348 "pcmpeqd %%mm7, %%mm7 \n\t" 01349 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01350 "pop %%"REG_BP" \n\t" 01351 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01352 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01353 "a" (&c->redDither) 01354 ); 01355 } 01356 } 01357 } 01358 01359 static void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, 01360 const uint16_t *ubuf0, const uint16_t *ubuf1, 01361 const uint16_t *vbuf0, const uint16_t *vbuf1, 01362 const uint16_t *abuf0, uint8_t *dest, 01363 int dstW, int uvalpha, enum PixelFormat dstFormat, 01364 int flags, int y) 01365 { 01366 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 01367 01368 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 01369 __asm__ volatile( 01370 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01371 "mov %4, %%"REG_b" \n\t" 01372 "push %%"REG_BP" \n\t" 01373 YSCALEYUV2RGB1(%%REGBP, %5) 01374 "pxor %%mm7, %%mm7 \n\t" 01375 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 01376 "pop %%"REG_BP" \n\t" 01377 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01378 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01379 "a" (&c->redDither) 01380 ); 01381 } else { 01382 __asm__ volatile( 01383 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01384 "mov %4, %%"REG_b" \n\t" 01385 "push %%"REG_BP" \n\t" 01386 YSCALEYUV2RGB1b(%%REGBP, %5) 01387 "pxor %%mm7, %%mm7 \n\t" 01388 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 01389 "pop %%"REG_BP" \n\t" 01390 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01391 
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01392 "a" (&c->redDither) 01393 ); 01394 } 01395 } 01396 01397 static void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, 01398 const uint16_t *ubuf0, const uint16_t *ubuf1, 01399 const uint16_t *vbuf0, const uint16_t *vbuf1, 01400 const uint16_t *abuf0, uint8_t *dest, 01401 int dstW, int uvalpha, enum PixelFormat dstFormat, 01402 int flags, int y) 01403 { 01404 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 01405 01406 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 01407 __asm__ volatile( 01408 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01409 "mov %4, %%"REG_b" \n\t" 01410 "push %%"REG_BP" \n\t" 01411 YSCALEYUV2RGB1(%%REGBP, %5) 01412 "pxor %%mm7, %%mm7 \n\t" 01413 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01414 #ifdef DITHER1XBPP 01415 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01416 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01417 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01418 #endif 01419 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 01420 "pop %%"REG_BP" \n\t" 01421 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01422 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01423 "a" (&c->redDither) 01424 ); 01425 } else { 01426 __asm__ volatile( 01427 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01428 "mov %4, %%"REG_b" \n\t" 01429 "push %%"REG_BP" \n\t" 01430 YSCALEYUV2RGB1b(%%REGBP, %5) 01431 "pxor %%mm7, %%mm7 \n\t" 01432 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01433 #ifdef DITHER1XBPP 01434 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01435 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01436 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01437 #endif 01438 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 01439 "pop %%"REG_BP" \n\t" 01440 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01441 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01442 "a" (&c->redDither) 01443 ); 01444 } 01445 } 01446 01447 static void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, 01448 const uint16_t *ubuf0, const uint16_t *ubuf1, 01449 const uint16_t *vbuf0, const uint16_t *vbuf1, 01450 const uint16_t *abuf0, uint8_t *dest, 01451 int dstW, int uvalpha, enum PixelFormat dstFormat, 01452 int flags, int y) 01453 { 01454 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 01455 01456 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 01457 __asm__ volatile( 01458 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01459 "mov %4, %%"REG_b" \n\t" 01460 "push %%"REG_BP" \n\t" 01461 YSCALEYUV2RGB1(%%REGBP, %5) 01462 "pxor %%mm7, %%mm7 \n\t" 01463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01464 #ifdef DITHER1XBPP 01465 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01466 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01467 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01468 #endif 01469 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 01470 "pop %%"REG_BP" \n\t" 01471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01472 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01473 "a" (&c->redDither) 01474 ); 01475 } else { 01476 __asm__ volatile( 01477 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01478 "mov %4, %%"REG_b" \n\t" 01479 "push %%"REG_BP" \n\t" 01480 YSCALEYUV2RGB1b(%%REGBP, %5) 01481 "pxor %%mm7, %%mm7 \n\t" 01482 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01483 #ifdef DITHER1XBPP 01484 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01485 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01486 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01487 #endif 01488 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 01489 "pop 
%%"REG_BP" \n\t" 01490 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01491 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01492 "a" (&c->redDither) 01493 ); 01494 } 01495 } 01496 01497 #define REAL_YSCALEYUV2PACKED1(index, c) \ 01498 "xor "#index", "#index" \n\t"\ 01499 ".p2align 4 \n\t"\ 01500 "1: \n\t"\ 01501 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 01502 "add "UV_OFFx2"("#c"), "#index" \n\t" \ 01503 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 01504 "sub "UV_OFFx2"("#c"), "#index" \n\t" \ 01505 "psraw $7, %%mm3 \n\t" \ 01506 "psraw $7, %%mm4 \n\t" \ 01507 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 01508 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 01509 "psraw $7, %%mm1 \n\t" \ 01510 "psraw $7, %%mm7 \n\t" \ 01511 01512 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 01513 01514 #define REAL_YSCALEYUV2PACKED1b(index, c) \ 01515 "xor "#index", "#index" \n\t"\ 01516 ".p2align 4 \n\t"\ 01517 "1: \n\t"\ 01518 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 01519 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 01520 "add "UV_OFFx2"("#c"), "#index" \n\t" \ 01521 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 01522 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 01523 "sub "UV_OFFx2"("#c"), "#index" \n\t" \ 01524 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 01525 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 01526 "psrlw $8, %%mm3 \n\t" \ 01527 "psrlw $8, %%mm4 \n\t" \ 01528 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 01529 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 01530 "psraw $7, %%mm1 \n\t" \ 01531 "psraw $7, %%mm7 \n\t" 01532 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 01533 01534 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, 01535 const uint16_t *ubuf0, const uint16_t *ubuf1, 01536 const uint16_t *vbuf0, const uint16_t *vbuf1, 01537 const uint16_t *abuf0, uint8_t *dest, 01538 int dstW, int uvalpha, enum PixelFormat dstFormat, 01539 int flags, int y) 01540 { 01541 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 01542 01543 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 01544 __asm__ volatile( 01545 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01546 "mov %4, %%"REG_b" \n\t" 01547 "push %%"REG_BP" \n\t" 01548 YSCALEYUV2PACKED1(%%REGBP, %5) 01549 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 01550 "pop %%"REG_BP" \n\t" 01551 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01552 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01553 "a" (&c->redDither) 01554 ); 01555 } else { 01556 __asm__ volatile( 01557 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01558 "mov %4, %%"REG_b" \n\t" 01559 "push %%"REG_BP" \n\t" 01560 YSCALEYUV2PACKED1b(%%REGBP, %5) 01561 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 01562 "pop %%"REG_BP" \n\t" 01563 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01564 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 01565 "a" (&c->redDither) 01566 ); 01567 } 01568 } 01569 01570 #if !COMPILE_TEMPLATE_MMX2 01571 //FIXME yuy2* can read up to 7 samples too much 01572 01573 static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, 01574 int width, uint32_t *unused) 01575 { 01576 __asm__ volatile( 01577 "movq "MANGLE(bm01010101)", %%mm2 \n\t" 01578 "mov %0, %%"REG_a" \n\t" 01579 "1: \n\t" 01580 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01581 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01582 "pand %%mm2, %%mm0 \n\t" 01583 "pand %%mm2, 
%%mm1 \n\t" 01584 "packuswb %%mm1, %%mm0 \n\t" 01585 "movq %%mm0, (%2, %%"REG_a") \n\t" 01586 "add $8, %%"REG_a" \n\t" 01587 " js 1b \n\t" 01588 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) 01589 : "%"REG_a 01590 ); 01591 } 01592 01593 static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, 01594 const uint8_t *src1, const uint8_t *src2, 01595 int width, uint32_t *unused) 01596 { 01597 __asm__ volatile( 01598 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01599 "mov %0, %%"REG_a" \n\t" 01600 "1: \n\t" 01601 "movq (%1, %%"REG_a",4), %%mm0 \n\t" 01602 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" 01603 "psrlw $8, %%mm0 \n\t" 01604 "psrlw $8, %%mm1 \n\t" 01605 "packuswb %%mm1, %%mm0 \n\t" 01606 "movq %%mm0, %%mm1 \n\t" 01607 "psrlw $8, %%mm0 \n\t" 01608 "pand %%mm4, %%mm1 \n\t" 01609 "packuswb %%mm0, %%mm0 \n\t" 01610 "packuswb %%mm1, %%mm1 \n\t" 01611 "movd %%mm0, (%3, %%"REG_a") \n\t" 01612 "movd %%mm1, (%2, %%"REG_a") \n\t" 01613 "add $4, %%"REG_a" \n\t" 01614 " js 1b \n\t" 01615 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) 01616 : "%"REG_a 01617 ); 01618 assert(src1 == src2); 01619 } 01620 01621 static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, 01622 const uint8_t *src1, const uint8_t *src2, 01623 int width, uint32_t *unused) 01624 { 01625 __asm__ volatile( 01626 "mov %0, %%"REG_a" \n\t" 01627 "1: \n\t" 01628 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01629 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01630 "movq (%2, %%"REG_a",2), %%mm2 \n\t" 01631 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" 01632 "psrlw $8, %%mm0 \n\t" 01633 "psrlw $8, %%mm1 \n\t" 01634 "psrlw $8, %%mm2 \n\t" 01635 "psrlw $8, %%mm3 \n\t" 01636 "packuswb %%mm1, %%mm0 \n\t" 01637 "packuswb %%mm3, %%mm2 \n\t" 01638 "movq %%mm0, (%3, %%"REG_a") \n\t" 01639 "movq %%mm2, (%4, %%"REG_a") \n\t" 01640 "add $8, %%"REG_a" \n\t" 01641 " js 1b \n\t" 01642 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) 01643 : "%"REG_a 01644 ); 01645 } 01646 01647 /* This is almost identical to the previous, end exists only because 01648 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. 
*/ 01649 static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, 01650 int width, uint32_t *unused) 01651 { 01652 __asm__ volatile( 01653 "mov %0, %%"REG_a" \n\t" 01654 "1: \n\t" 01655 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01656 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01657 "psrlw $8, %%mm0 \n\t" 01658 "psrlw $8, %%mm1 \n\t" 01659 "packuswb %%mm1, %%mm0 \n\t" 01660 "movq %%mm0, (%2, %%"REG_a") \n\t" 01661 "add $8, %%"REG_a" \n\t" 01662 " js 1b \n\t" 01663 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) 01664 : "%"REG_a 01665 ); 01666 } 01667 01668 static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, 01669 const uint8_t *src1, const uint8_t *src2, 01670 int width, uint32_t *unused) 01671 { 01672 __asm__ volatile( 01673 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01674 "mov %0, %%"REG_a" \n\t" 01675 "1: \n\t" 01676 "movq (%1, %%"REG_a",4), %%mm0 \n\t" 01677 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" 01678 "pand %%mm4, %%mm0 \n\t" 01679 "pand %%mm4, %%mm1 \n\t" 01680 "packuswb %%mm1, %%mm0 \n\t" 01681 "movq %%mm0, %%mm1 \n\t" 01682 "psrlw $8, %%mm0 \n\t" 01683 "pand %%mm4, %%mm1 \n\t" 01684 "packuswb %%mm0, %%mm0 \n\t" 01685 "packuswb %%mm1, %%mm1 \n\t" 01686 "movd %%mm0, (%3, %%"REG_a") \n\t" 01687 "movd %%mm1, (%2, %%"REG_a") \n\t" 01688 "add $4, %%"REG_a" \n\t" 01689 " js 1b \n\t" 01690 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) 01691 : "%"REG_a 01692 ); 01693 assert(src1 == src2); 01694 } 01695 01696 static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, 01697 const uint8_t *src1, const uint8_t *src2, 01698 int width, uint32_t *unused) 01699 { 01700 __asm__ volatile( 01701 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01702 "mov %0, %%"REG_a" \n\t" 01703 "1: \n\t" 01704 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01705 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01706 "movq (%2, %%"REG_a",2), %%mm2 \n\t" 01707 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" 01708 "pand %%mm4, %%mm0 \n\t" 01709 "pand %%mm4, %%mm1 \n\t" 01710 "pand %%mm4, %%mm2 \n\t" 01711 "pand %%mm4, %%mm3 \n\t" 01712 "packuswb %%mm1, %%mm0 \n\t" 01713 "packuswb %%mm3, %%mm2 \n\t" 01714 "movq %%mm0, (%3, %%"REG_a") \n\t" 01715 "movq %%mm2, (%4, %%"REG_a") \n\t" 01716 "add $8, %%"REG_a" \n\t" 01717 " js 1b \n\t" 01718 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) 01719 : "%"REG_a 01720 ); 01721 } 01722 01723 static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2, 01724 const uint8_t *src, int width) 01725 { 01726 __asm__ volatile( 01727 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01728 "mov %0, %%"REG_a" \n\t" 01729 "1: \n\t" 01730 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01731 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01732 "movq %%mm0, %%mm2 \n\t" 01733 "movq %%mm1, %%mm3 \n\t" 01734 "pand %%mm4, %%mm0 \n\t" 01735 "pand %%mm4, %%mm1 \n\t" 01736 "psrlw $8, %%mm2 \n\t" 01737 "psrlw $8, %%mm3 \n\t" 01738 "packuswb %%mm1, %%mm0 \n\t" 01739 "packuswb %%mm3, %%mm2 \n\t" 01740 "movq %%mm0, (%2, %%"REG_a") \n\t" 01741 "movq %%mm2, (%3, %%"REG_a") \n\t" 01742 "add $8, %%"REG_a" \n\t" 01743 " js 1b \n\t" 01744 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width) 01745 : "%"REG_a 01746 ); 01747 } 01748 01749 static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV, 01750 const uint8_t *src1, const uint8_t *src2, 01751 int width, uint32_t *unused) 01752 { 01753 RENAME(nvXXtoUV)(dstU, dstV, src1, width); 01754 } 01755 01756 static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, 01757 const uint8_t *src1, const 
uint8_t *src2, 01758 int width, uint32_t *unused) 01759 { 01760 RENAME(nvXXtoUV)(dstV, dstU, src1, width); 01761 } 01762 #endif /* !COMPILE_TEMPLATE_MMX2 */ 01763 01764 static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, 01765 int width, enum PixelFormat srcFormat) 01766 { 01767 01768 if(srcFormat == PIX_FMT_BGR24) { 01769 __asm__ volatile( 01770 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t" 01771 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t" 01772 : 01773 ); 01774 } else { 01775 __asm__ volatile( 01776 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t" 01777 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t" 01778 : 01779 ); 01780 } 01781 01782 __asm__ volatile( 01783 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t" 01784 "mov %2, %%"REG_a" \n\t" 01785 "pxor %%mm7, %%mm7 \n\t" 01786 "1: \n\t" 01787 PREFETCH" 64(%0) \n\t" 01788 "movd (%0), %%mm0 \n\t" 01789 "movd 2(%0), %%mm1 \n\t" 01790 "movd 6(%0), %%mm2 \n\t" 01791 "movd 8(%0), %%mm3 \n\t" 01792 "add $12, %0 \n\t" 01793 "punpcklbw %%mm7, %%mm0 \n\t" 01794 "punpcklbw %%mm7, %%mm1 \n\t" 01795 "punpcklbw %%mm7, %%mm2 \n\t" 01796 "punpcklbw %%mm7, %%mm3 \n\t" 01797 "pmaddwd %%mm5, %%mm0 \n\t" 01798 "pmaddwd %%mm6, %%mm1 \n\t" 01799 "pmaddwd %%mm5, %%mm2 \n\t" 01800 "pmaddwd %%mm6, %%mm3 \n\t" 01801 "paddd %%mm1, %%mm0 \n\t" 01802 "paddd %%mm3, %%mm2 \n\t" 01803 "paddd %%mm4, %%mm0 \n\t" 01804 "paddd %%mm4, %%mm2 \n\t" 01805 "psrad $15, %%mm0 \n\t" 01806 "psrad $15, %%mm2 \n\t" 01807 "packssdw %%mm2, %%mm0 \n\t" 01808 "packuswb %%mm0, %%mm0 \n\t" 01809 "movd %%mm0, (%1, %%"REG_a") \n\t" 01810 "add $4, %%"REG_a" \n\t" 01811 " js 1b \n\t" 01812 : "+r" (src) 01813 : "r" (dst+width), "g" ((x86_reg)-width) 01814 : "%"REG_a 01815 ); 01816 } 01817 01818 static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, 01819 int width, uint32_t *unused) 01820 { 01821 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24); 01822 } 01823 01824 static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, 01825 int width, uint32_t *unused) 01826 { 01827 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24); 01828 } 01829 01830 static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, 01831 const uint8_t *src, int width, 01832 enum PixelFormat srcFormat) 01833 { 01834 __asm__ volatile( 01835 "movq 24(%4), %%mm6 \n\t" 01836 "mov %3, %%"REG_a" \n\t" 01837 "pxor %%mm7, %%mm7 \n\t" 01838 "1: \n\t" 01839 PREFETCH" 64(%0) \n\t" 01840 "movd (%0), %%mm0 \n\t" 01841 "movd 2(%0), %%mm1 \n\t" 01842 "punpcklbw %%mm7, %%mm0 \n\t" 01843 "punpcklbw %%mm7, %%mm1 \n\t" 01844 "movq %%mm0, %%mm2 \n\t" 01845 "movq %%mm1, %%mm3 \n\t" 01846 "pmaddwd (%4), %%mm0 \n\t" 01847 "pmaddwd 8(%4), %%mm1 \n\t" 01848 "pmaddwd 16(%4), %%mm2 \n\t" 01849 "pmaddwd %%mm6, %%mm3 \n\t" 01850 "paddd %%mm1, %%mm0 \n\t" 01851 "paddd %%mm3, %%mm2 \n\t" 01852 01853 "movd 6(%0), %%mm1 \n\t" 01854 "movd 8(%0), %%mm3 \n\t" 01855 "add $12, %0 \n\t" 01856 "punpcklbw %%mm7, %%mm1 \n\t" 01857 "punpcklbw %%mm7, %%mm3 \n\t" 01858 "movq %%mm1, %%mm4 \n\t" 01859 "movq %%mm3, %%mm5 \n\t" 01860 "pmaddwd (%4), %%mm1 \n\t" 01861 "pmaddwd 8(%4), %%mm3 \n\t" 01862 "pmaddwd 16(%4), %%mm4 \n\t" 01863 "pmaddwd %%mm6, %%mm5 \n\t" 01864 "paddd %%mm3, %%mm1 \n\t" 01865 "paddd %%mm5, %%mm4 \n\t" 01866 01867 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t" 01868 "paddd %%mm3, %%mm0 \n\t" 01869 "paddd %%mm3, %%mm2 \n\t" 01870 "paddd %%mm3, %%mm1 \n\t" 01871 "paddd %%mm3, %%mm4 \n\t" 01872 "psrad $15, %%mm0 \n\t" 01873 "psrad $15, %%mm2 \n\t" 01874 "psrad $15, %%mm1 \n\t" 01875 
"psrad $15, %%mm4 \n\t" 01876 "packssdw %%mm1, %%mm0 \n\t" 01877 "packssdw %%mm4, %%mm2 \n\t" 01878 "packuswb %%mm0, %%mm0 \n\t" 01879 "packuswb %%mm2, %%mm2 \n\t" 01880 "movd %%mm0, (%1, %%"REG_a") \n\t" 01881 "movd %%mm2, (%2, %%"REG_a") \n\t" 01882 "add $4, %%"REG_a" \n\t" 01883 " js 1b \n\t" 01884 : "+r" (src) 01885 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]) 01886 : "%"REG_a 01887 ); 01888 } 01889 01890 static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, 01891 const uint8_t *src1, const uint8_t *src2, 01892 int width, uint32_t *unused) 01893 { 01894 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24); 01895 assert(src1 == src2); 01896 } 01897 01898 static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, 01899 const uint8_t *src1, const uint8_t *src2, 01900 int width, uint32_t *unused) 01901 { 01902 assert(src1==src2); 01903 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24); 01904 } 01905 01906 #if !COMPILE_TEMPLATE_MMX2 01907 // bilinear / bicubic scaling 01908 static void RENAME(hScale)(int16_t *dst, int dstW, 01909 const uint8_t *src, int srcW, 01910 int xInc, const int16_t *filter, 01911 const int16_t *filterPos, int filterSize) 01912 { 01913 assert(filterSize % 4 == 0 && filterSize>0); 01914 if (filterSize==4) { // Always true for upscaling, sometimes for down, too. 01915 x86_reg counter= -2*dstW; 01916 filter-= counter*2; 01917 filterPos-= counter/2; 01918 dst-= counter/2; 01919 __asm__ volatile( 01920 #if defined(PIC) 01921 "push %%"REG_b" \n\t" 01922 #endif 01923 "pxor %%mm7, %%mm7 \n\t" 01924 "push %%"REG_BP" \n\t" // we use 7 regs here ... 01925 "mov %%"REG_a", %%"REG_BP" \n\t" 01926 ".p2align 4 \n\t" 01927 "1: \n\t" 01928 "movzwl (%2, %%"REG_BP"), %%eax \n\t" 01929 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" 01930 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" 01931 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" 01932 "movd (%3, %%"REG_a"), %%mm0 \n\t" 01933 "movd (%3, %%"REG_b"), %%mm2 \n\t" 01934 "punpcklbw %%mm7, %%mm0 \n\t" 01935 "punpcklbw %%mm7, %%mm2 \n\t" 01936 "pmaddwd %%mm1, %%mm0 \n\t" 01937 "pmaddwd %%mm2, %%mm3 \n\t" 01938 "movq %%mm0, %%mm4 \n\t" 01939 "punpckldq %%mm3, %%mm0 \n\t" 01940 "punpckhdq %%mm3, %%mm4 \n\t" 01941 "paddd %%mm4, %%mm0 \n\t" 01942 "psrad $7, %%mm0 \n\t" 01943 "packssdw %%mm0, %%mm0 \n\t" 01944 "movd %%mm0, (%4, %%"REG_BP") \n\t" 01945 "add $4, %%"REG_BP" \n\t" 01946 " jnc 1b \n\t" 01947 01948 "pop %%"REG_BP" \n\t" 01949 #if defined(PIC) 01950 "pop %%"REG_b" \n\t" 01951 #endif 01952 : "+a" (counter) 01953 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 01954 #if !defined(PIC) 01955 : "%"REG_b 01956 #endif 01957 ); 01958 } else if (filterSize==8) { 01959 x86_reg counter= -2*dstW; 01960 filter-= counter*4; 01961 filterPos-= counter/2; 01962 dst-= counter/2; 01963 __asm__ volatile( 01964 #if defined(PIC) 01965 "push %%"REG_b" \n\t" 01966 #endif 01967 "pxor %%mm7, %%mm7 \n\t" 01968 "push %%"REG_BP" \n\t" // we use 7 regs here ... 
01969 "mov %%"REG_a", %%"REG_BP" \n\t" 01970 ".p2align 4 \n\t" 01971 "1: \n\t" 01972 "movzwl (%2, %%"REG_BP"), %%eax \n\t" 01973 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" 01974 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" 01975 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" 01976 "movd (%3, %%"REG_a"), %%mm0 \n\t" 01977 "movd (%3, %%"REG_b"), %%mm2 \n\t" 01978 "punpcklbw %%mm7, %%mm0 \n\t" 01979 "punpcklbw %%mm7, %%mm2 \n\t" 01980 "pmaddwd %%mm1, %%mm0 \n\t" 01981 "pmaddwd %%mm2, %%mm3 \n\t" 01982 01983 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" 01984 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" 01985 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" 01986 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" 01987 "punpcklbw %%mm7, %%mm4 \n\t" 01988 "punpcklbw %%mm7, %%mm2 \n\t" 01989 "pmaddwd %%mm1, %%mm4 \n\t" 01990 "pmaddwd %%mm2, %%mm5 \n\t" 01991 "paddd %%mm4, %%mm0 \n\t" 01992 "paddd %%mm5, %%mm3 \n\t" 01993 "movq %%mm0, %%mm4 \n\t" 01994 "punpckldq %%mm3, %%mm0 \n\t" 01995 "punpckhdq %%mm3, %%mm4 \n\t" 01996 "paddd %%mm4, %%mm0 \n\t" 01997 "psrad $7, %%mm0 \n\t" 01998 "packssdw %%mm0, %%mm0 \n\t" 01999 "movd %%mm0, (%4, %%"REG_BP") \n\t" 02000 "add $4, %%"REG_BP" \n\t" 02001 " jnc 1b \n\t" 02002 02003 "pop %%"REG_BP" \n\t" 02004 #if defined(PIC) 02005 "pop %%"REG_b" \n\t" 02006 #endif 02007 : "+a" (counter) 02008 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 02009 #if !defined(PIC) 02010 : "%"REG_b 02011 #endif 02012 ); 02013 } else { 02014 const uint8_t *offset = src+filterSize; 02015 x86_reg counter= -2*dstW; 02016 //filter-= counter*filterSize/2; 02017 filterPos-= counter/2; 02018 dst-= counter/2; 02019 __asm__ volatile( 02020 "pxor %%mm7, %%mm7 \n\t" 02021 ".p2align 4 \n\t" 02022 "1: \n\t" 02023 "mov %2, %%"REG_c" \n\t" 02024 "movzwl (%%"REG_c", %0), %%eax \n\t" 02025 "movzwl 2(%%"REG_c", %0), %%edx \n\t" 02026 "mov %5, %%"REG_c" \n\t" 02027 "pxor %%mm4, %%mm4 \n\t" 02028 "pxor %%mm5, %%mm5 \n\t" 02029 "2: \n\t" 02030 "movq (%1), %%mm1 \n\t" 02031 "movq (%1, %6), %%mm3 \n\t" 02032 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" 02033 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" 02034 "punpcklbw %%mm7, %%mm0 \n\t" 02035 "punpcklbw %%mm7, %%mm2 \n\t" 02036 "pmaddwd %%mm1, %%mm0 \n\t" 02037 "pmaddwd %%mm2, %%mm3 \n\t" 02038 "paddd %%mm3, %%mm5 \n\t" 02039 "paddd %%mm0, %%mm4 \n\t" 02040 "add $8, %1 \n\t" 02041 "add $4, %%"REG_c" \n\t" 02042 "cmp %4, %%"REG_c" \n\t" 02043 " jb 2b \n\t" 02044 "add %6, %1 \n\t" 02045 "movq %%mm4, %%mm0 \n\t" 02046 "punpckldq %%mm5, %%mm4 \n\t" 02047 "punpckhdq %%mm5, %%mm0 \n\t" 02048 "paddd %%mm0, %%mm4 \n\t" 02049 "psrad $7, %%mm4 \n\t" 02050 "packssdw %%mm4, %%mm4 \n\t" 02051 "mov %3, %%"REG_a" \n\t" 02052 "movd %%mm4, (%%"REG_a", %0) \n\t" 02053 "add $4, %0 \n\t" 02054 " jnc 1b \n\t" 02055 02056 : "+r" (counter), "+r" (filter) 02057 : "m" (filterPos), "m" (dst), "m"(offset), 02058 "m" (src), "r" ((x86_reg)filterSize*2) 02059 : "%"REG_a, "%"REG_c, "%"REG_d 02060 ); 02061 } 02062 } 02063 #endif /* !COMPILE_TEMPLATE_MMX2 */ 02064 02065 #if COMPILE_TEMPLATE_MMX2 02066 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, 02067 int dstWidth, const uint8_t *src, 02068 int srcW, int xInc) 02069 { 02070 int16_t *filterPos = c->hLumFilterPos; 02071 int16_t *filter = c->hLumFilter; 02072 void *mmx2FilterCode= c->lumMmx2FilterCode; 02073 int i; 02074 #if defined(PIC) 02075 uint64_t ebxsave; 02076 #endif 02077 #if ARCH_X86_64 02078 uint64_t retsave; 02079 #endif 02080 02081 __asm__ volatile( 02082 #if defined(PIC) 02083 "mov %%"REG_b", %5 \n\t" 02084 #if ARCH_X86_64 02085 "mov -8(%%rsp), %%"REG_a" \n\t" 
02086 "mov %%"REG_a", %6 \n\t" 02087 #endif 02088 #else 02089 #if ARCH_X86_64 02090 "mov -8(%%rsp), %%"REG_a" \n\t" 02091 "mov %%"REG_a", %5 \n\t" 02092 #endif 02093 #endif 02094 "pxor %%mm7, %%mm7 \n\t" 02095 "mov %0, %%"REG_c" \n\t" 02096 "mov %1, %%"REG_D" \n\t" 02097 "mov %2, %%"REG_d" \n\t" 02098 "mov %3, %%"REG_b" \n\t" 02099 "xor %%"REG_a", %%"REG_a" \n\t" // i 02100 PREFETCH" (%%"REG_c") \n\t" 02101 PREFETCH" 32(%%"REG_c") \n\t" 02102 PREFETCH" 64(%%"REG_c") \n\t" 02103 02104 #if ARCH_X86_64 02105 #define CALL_MMX2_FILTER_CODE \ 02106 "movl (%%"REG_b"), %%esi \n\t"\ 02107 "call *%4 \n\t"\ 02108 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ 02109 "add %%"REG_S", %%"REG_c" \n\t"\ 02110 "add %%"REG_a", %%"REG_D" \n\t"\ 02111 "xor %%"REG_a", %%"REG_a" \n\t"\ 02112 02113 #else 02114 #define CALL_MMX2_FILTER_CODE \ 02115 "movl (%%"REG_b"), %%esi \n\t"\ 02116 "call *%4 \n\t"\ 02117 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ 02118 "add %%"REG_a", %%"REG_D" \n\t"\ 02119 "xor %%"REG_a", %%"REG_a" \n\t"\ 02120 02121 #endif /* ARCH_X86_64 */ 02122 02123 CALL_MMX2_FILTER_CODE 02124 CALL_MMX2_FILTER_CODE 02125 CALL_MMX2_FILTER_CODE 02126 CALL_MMX2_FILTER_CODE 02127 CALL_MMX2_FILTER_CODE 02128 CALL_MMX2_FILTER_CODE 02129 CALL_MMX2_FILTER_CODE 02130 CALL_MMX2_FILTER_CODE 02131 02132 #if defined(PIC) 02133 "mov %5, %%"REG_b" \n\t" 02134 #if ARCH_X86_64 02135 "mov %6, %%"REG_a" \n\t" 02136 "mov %%"REG_a", -8(%%rsp) \n\t" 02137 #endif 02138 #else 02139 #if ARCH_X86_64 02140 "mov %5, %%"REG_a" \n\t" 02141 "mov %%"REG_a", -8(%%rsp) \n\t" 02142 #endif 02143 #endif 02144 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), 02145 "m" (mmx2FilterCode) 02146 #if defined(PIC) 02147 ,"m" (ebxsave) 02148 #endif 02149 #if ARCH_X86_64 02150 ,"m"(retsave) 02151 #endif 02152 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D 02153 #if !defined(PIC) 02154 ,"%"REG_b 02155 #endif 02156 ); 02157 02158 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) 02159 dst[i] = src[srcW-1]*128; 02160 } 02161 02162 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, 02163 int dstWidth, const uint8_t *src1, 02164 const uint8_t *src2, int srcW, int xInc) 02165 { 02166 int16_t *filterPos = c->hChrFilterPos; 02167 int16_t *filter = c->hChrFilter; 02168 void *mmx2FilterCode= c->chrMmx2FilterCode; 02169 int i; 02170 #if defined(PIC) 02171 DECLARE_ALIGNED(8, uint64_t, ebxsave); 02172 #endif 02173 #if ARCH_X86_64 02174 DECLARE_ALIGNED(8, uint64_t, retsave); 02175 #endif 02176 02177 __asm__ volatile( 02178 #if defined(PIC) 02179 "mov %%"REG_b", %7 \n\t" 02180 #if ARCH_X86_64 02181 "mov -8(%%rsp), %%"REG_a" \n\t" 02182 "mov %%"REG_a", %8 \n\t" 02183 #endif 02184 #else 02185 #if ARCH_X86_64 02186 "mov -8(%%rsp), %%"REG_a" \n\t" 02187 "mov %%"REG_a", %7 \n\t" 02188 #endif 02189 #endif 02190 "pxor %%mm7, %%mm7 \n\t" 02191 "mov %0, %%"REG_c" \n\t" 02192 "mov %1, %%"REG_D" \n\t" 02193 "mov %2, %%"REG_d" \n\t" 02194 "mov %3, %%"REG_b" \n\t" 02195 "xor %%"REG_a", %%"REG_a" \n\t" // i 02196 PREFETCH" (%%"REG_c") \n\t" 02197 PREFETCH" 32(%%"REG_c") \n\t" 02198 PREFETCH" 64(%%"REG_c") \n\t" 02199 02200 CALL_MMX2_FILTER_CODE 02201 CALL_MMX2_FILTER_CODE 02202 CALL_MMX2_FILTER_CODE 02203 CALL_MMX2_FILTER_CODE 02204 "xor %%"REG_a", %%"REG_a" \n\t" // i 02205 "mov %5, %%"REG_c" \n\t" // src 02206 "mov %6, %%"REG_D" \n\t" // buf2 02207 PREFETCH" (%%"REG_c") \n\t" 02208 PREFETCH" 32(%%"REG_c") \n\t" 02209 PREFETCH" 64(%%"REG_c") \n\t" 02210 02211 CALL_MMX2_FILTER_CODE 02212 CALL_MMX2_FILTER_CODE 02213 CALL_MMX2_FILTER_CODE 02214 
CALL_MMX2_FILTER_CODE 02215 02216 #if defined(PIC) 02217 "mov %7, %%"REG_b" \n\t" 02218 #if ARCH_X86_64 02219 "mov %8, %%"REG_a" \n\t" 02220 "mov %%"REG_a", -8(%%rsp) \n\t" 02221 #endif 02222 #else 02223 #if ARCH_X86_64 02224 "mov %7, %%"REG_a" \n\t" 02225 "mov %%"REG_a", -8(%%rsp) \n\t" 02226 #endif 02227 #endif 02228 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), 02229 "m" (mmx2FilterCode), "m" (src2), "m"(dst2) 02230 #if defined(PIC) 02231 ,"m" (ebxsave) 02232 #endif 02233 #if ARCH_X86_64 02234 ,"m"(retsave) 02235 #endif 02236 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D 02237 #if !defined(PIC) 02238 ,"%"REG_b 02239 #endif 02240 ); 02241 02242 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { 02243 dst1[i] = src1[srcW-1]*128; 02244 dst2[i] = src2[srcW-1]*128; 02245 } 02246 } 02247 #endif /* COMPILE_TEMPLATE_MMX2 */ 02248 02249 static av_cold void RENAME(sws_init_swScale)(SwsContext *c) 02250 { 02251 enum PixelFormat srcFormat = c->srcFormat, 02252 dstFormat = c->dstFormat; 02253 02254 if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && 02255 dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) { 02256 if (!(c->flags & SWS_BITEXACT)) { 02257 if (c->flags & SWS_ACCURATE_RND) { 02258 c->yuv2yuv1 = RENAME(yuv2yuv1_ar ); 02259 c->yuv2yuvX = RENAME(yuv2yuvX_ar ); 02260 if (!(c->flags & SWS_FULL_CHR_H_INT)) { 02261 switch (c->dstFormat) { 02262 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; 02263 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; 02264 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; 02265 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; 02266 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; 02267 default: break; 02268 } 02269 } 02270 } else { 02271 c->yuv2yuv1 = RENAME(yuv2yuv1 ); 02272 c->yuv2yuvX = RENAME(yuv2yuvX ); 02273 if (!(c->flags & SWS_FULL_CHR_H_INT)) { 02274 switch (c->dstFormat) { 02275 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; 02276 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; 02277 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; 02278 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; 02279 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; 02280 default: break; 02281 } 02282 } 02283 } 02284 } 02285 if (!(c->flags & SWS_FULL_CHR_H_INT)) { 02286 switch (c->dstFormat) { 02287 case PIX_FMT_RGB32: 02288 c->yuv2packed1 = RENAME(yuv2rgb32_1); 02289 c->yuv2packed2 = RENAME(yuv2rgb32_2); 02290 break; 02291 case PIX_FMT_BGR24: 02292 c->yuv2packed1 = RENAME(yuv2bgr24_1); 02293 c->yuv2packed2 = RENAME(yuv2bgr24_2); 02294 break; 02295 case PIX_FMT_RGB555: 02296 c->yuv2packed1 = RENAME(yuv2rgb555_1); 02297 c->yuv2packed2 = RENAME(yuv2rgb555_2); 02298 break; 02299 case PIX_FMT_RGB565: 02300 c->yuv2packed1 = RENAME(yuv2rgb565_1); 02301 c->yuv2packed2 = RENAME(yuv2rgb565_2); 02302 break; 02303 case PIX_FMT_YUYV422: 02304 c->yuv2packed1 = RENAME(yuv2yuyv422_1); 02305 c->yuv2packed2 = RENAME(yuv2yuyv422_2); 02306 break; 02307 default: 02308 break; 02309 } 02310 } 02311 } 02312 02313 #if !COMPILE_TEMPLATE_MMX2 02314 c->hScale = RENAME(hScale ); 02315 #endif /* !COMPILE_TEMPLATE_MMX2 */ 02316 02317 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). 
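// Editor's note: hyscale_fast/hcscale_fast are only set when the run-time generated MMX2 filter code is usable (c->canMMX2BeUsed); when they stay NULL the caller falls back to the generic c->hScale() path.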
02318 #if COMPILE_TEMPLATE_MMX2 02319 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) 02320 { 02321 c->hyscale_fast = RENAME(hyscale_fast); 02322 c->hcscale_fast = RENAME(hcscale_fast); 02323 } else { 02324 #endif /* COMPILE_TEMPLATE_MMX2 */ 02325 c->hyscale_fast = NULL; 02326 c->hcscale_fast = NULL; 02327 #if COMPILE_TEMPLATE_MMX2 02328 } 02329 #endif /* COMPILE_TEMPLATE_MMX2 */ 02330 02331 #if !COMPILE_TEMPLATE_MMX2 02332 switch(srcFormat) { 02333 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break; 02334 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break; 02335 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break; 02336 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break; 02337 case PIX_FMT_YUV420P16BE: 02338 case PIX_FMT_YUV422P16BE: 02339 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break; 02340 case PIX_FMT_YUV420P16LE: 02341 case PIX_FMT_YUV422P16LE: 02342 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break; 02343 default: break; 02344 } 02345 #endif /* !COMPILE_TEMPLATE_MMX2 */ 02346 if (!c->chrSrcHSubSample) { 02347 switch(srcFormat) { 02348 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break; 02349 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break; 02350 default: break; 02351 } 02352 } 02353 02354 switch (srcFormat) { 02355 #if !COMPILE_TEMPLATE_MMX2 02356 case PIX_FMT_YUYV422 : 02357 case PIX_FMT_YUV420P16BE: 02358 case PIX_FMT_YUV422P16BE: 02359 case PIX_FMT_YUV444P16BE: 02360 case PIX_FMT_Y400A : 02361 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break; 02362 case PIX_FMT_UYVY422 : 02363 case PIX_FMT_YUV420P16LE: 02364 case PIX_FMT_YUV422P16LE: 02365 case PIX_FMT_YUV444P16LE: 02366 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break; 02367 #endif /* !COMPILE_TEMPLATE_MMX2 */ 02368 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break; 02369 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break; 02370 default: break; 02371 } 02372 #if !COMPILE_TEMPLATE_MMX2 02373 if (c->alpPixBuf) { 02374 switch (srcFormat) { 02375 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break; 02376 default: break; 02377 } 02378 } 02379 #endif /* !COMPILE_TEMPLATE_MMX2 */ 02380 }
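
/*
 * Editor's note: the function below is an illustrative scalar sketch of what
 * the MMX hScale() loops above compute, added for readability.  It is not part
 * of libswscale; the name hScale_ref and its exact signature are invented for
 * this example, but the arithmetic (multiply-accumulate, >>7, saturate to
 * int16) mirrors the pmaddwd / psrad $7 / packssdw sequence in the assembly.
 */
#include <stdint.h>   /* the sketch's only dependency */

static void hScale_ref(int16_t *dst, int dstW, const uint8_t *src,
                       const int16_t *filter, const int16_t *filterPos,
                       int filterSize)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int val = 0;
        for (j = 0; j < filterSize; j++)        /* one 16-bit coefficient per source byte */
            val += src[filterPos[i] + j] * filter[i * filterSize + j];
        val >>= 7;                              /* psrad $7 */
        if (val < -32768) val = -32768;         /* packssdw saturates both ways */
        if (val >  32767) val =  32767;
        dst[i] = (int16_t)val;
    }
}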