Libav 0.7.1
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
    psum = vec_mladd(vB, vsrc1ssH, psum);\
    psum = vec_mladd(vC, vsrc2ssH, psum);\
    psum = vec_mladd(vD, vsrc3ssH, psum);\
    psum = BIAS2(psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    vsrc0ssH = vsrc2ssH;\
    vsrc1ssH = vsrc3ssH;\
\
    dst += stride;\
    src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
    psum = vec_mladd(vE, vsrc1ssH, psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    dst += stride;\
    src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)
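
/* The two CHROMA_MC8 cores above compute the H.264 bilinear chroma
 * interpolation.  As a rough scalar sketch (illustrative only, not part of
 * the original file), one row of the 8-wide block is
 *
 *     A = (8 - x) * (8 - y);  B = x * (8 - y);
 *     C = (8 - x) * y;        D = x * y;
 *     for (i = 0; i < 8; i++)
 *         dst[i] = (A * src[i]          + B * src[i + 1] +
 *                   C * src[i + stride] + D * src[i + stride + 1] + bias) >> 6;
 *
 * CHROMA_MC8_ALTIVEC_CORE takes the bias through BIAS1/BIAS2 (32 for H.264,
 * 28 via add28 for the VC-1 no-rounding variant below); the _SIMPLE core
 * covers the x == 0 or y == 0 cases where only two taps remain.
 */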
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif
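
/* The VC-1 variant below differs from the H.264 routine above only in its
 * rounding: the multiply-add chain starts from a zero bias and add28 adds
 * 32 - 4 = 28 (v28ss) before the final >> 6, i.e. VC-1's no-rounding mode.
 */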
/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE
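
/* The qpel lowpass routines below apply the H.264 6-tap (1,-5,20,20,-5,1)
 * half-pel filter.  As a scalar sketch (illustrative only), one output of a
 * single horizontal pass is
 *
 *     dst[i] = av_clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] +
 *                             20*src[i+1] - 5*src[i+2] + src[i+3] + 16) >> 5);
 *
 * the vector code expresses the same thing with 16-bit saturated adds,
 * vec_mladd and a final unsigned saturating pack.
 */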
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif
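
/* The vertical pass applies the same 6-tap filter down each column.  The six
 * source rows are kept in registers and rotated each iteration, so only the
 * new srcP3 row has to be loaded per output row.
 */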
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
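
/* The hv (centre) case filters horizontally first, storing the unrounded
 * 16-bit intermediates in tmp, then filters those vertically and normalizes
 * once with (... + 512) >> 10 (v512si, v10ui below).
 */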
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
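
    /* Second pass: filter the 16-bit intermediates from tmp vertically.
     * The products no longer fit in 16 bits, so each term is split into
     * even/odd 32-bit halves with vec_mule/vec_mulo, rounded with
     * (... + 512) >> 10, packed back to 16 bits and re-interleaved through
     * mperm before the final saturating pack to bytes.
     */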
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif