Libav 0.7.1
|
00001 /* 00002 * AltiVec acceleration for colorspace conversion 00003 * 00004 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> 00005 * 00006 * This file is part of Libav. 00007 * 00008 * Libav is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Lesser General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2.1 of the License, or (at your option) any later version. 00012 * 00013 * Libav is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Lesser General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Lesser General Public 00019 * License along with Libav; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00023 /* 00024 Convert I420 YV12 to RGB in various formats, 00025 it rejects images that are not in 420 formats, 00026 it rejects images that don't have widths of multiples of 16, 00027 it rejects images that don't have heights of multiples of 2. 00028 Reject defers to C simulation code. 00029 00030 Lots of optimizations to be done here. 00031 00032 1. Need to fix saturation code. I just couldn't get it to fly with packs 00033 and adds, so we currently use max/min to clip. 00034 00035 2. The inefficient use of chroma loading needs a bit of brushing up. 00036 00037 3. Analysis of pipeline stalls needs to be done. Use shark to identify 00038 pipeline stalls. 00039 00040 00041 MODIFIED to calculate coeffs from currently selected color space. 00042 MODIFIED core to be a macro where you specify the output format. 00043 ADDED UYVY conversion which is never called due to some thing in swscale. 00044 CORRECTED algorithim selection to be strict on input formats. 00045 ADDED runtime detection of AltiVec. 00046 00047 ADDED altivec_yuv2packedX vertical scl + RGB converter 00048 00049 March 27,2004 00050 PERFORMANCE ANALYSIS 00051 00052 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo 00053 used as test. 00054 The AltiVec version uses 10% of the processor or ~100Mips for D1 video 00055 same sequence. 00056 00057 720 * 480 * 30 ~10MPS 00058 00059 so we have roughly 10 clocks per pixel. This is too high, something has 00060 to be wrong. 00061 00062 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the 00063 need for vec_min. 00064 00065 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have 00066 the input video frame, it was just decompressed so it probably resides in L1 00067 caches. However, we are creating the output video stream. This needs to use the 00068 DSTST instruction to optimize for the cache. We couple this with the fact that 00069 we are not going to be visiting the input buffer again so we mark it Least 00070 Recently Used. This shaves 25% of the processor cycles off. 00071 00072 Now memcpy is the largest mips consumer in the system, probably due 00073 to the inefficient X11 stuff. 00074 00075 GL libraries seem to be very slow on this machine 1.33Ghz PB running 00076 Jaguar, this is not the case for my 1Ghz PB. I thought it might be 00077 a versioning issue, however I have libGL.1.2.dylib for both 00078 machines. (We need to figure this out now.) 00079 00080 GL2 libraries work now with patch for RGB32. 00081 00082 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. 00083 00084 Integrated luma prescaling adjustment for saturation/contrast/brightness 00085 adjustment. 00086 */ 00087 00088 #include <stdio.h> 00089 #include <stdlib.h> 00090 #include <string.h> 00091 #include <inttypes.h> 00092 #include <assert.h> 00093 #include "config.h" 00094 #include "libswscale/rgb2rgb.h" 00095 #include "libswscale/swscale.h" 00096 #include "libswscale/swscale_internal.h" 00097 #include "libavutil/cpu.h" 00098 #include "yuv2rgb_altivec.h" 00099 00100 #undef PROFILE_THE_BEAST 00101 #undef INC_SCALING 00102 00103 typedef unsigned char ubyte; 00104 typedef signed char sbyte; 00105 00106 00107 /* RGB interleaver, 16 planar pels 8-bit samples per channel in 00108 homogeneous vector registers x0,x1,x2 are interleaved with the 00109 following technique: 00110 00111 o0 = vec_mergeh (x0,x1); 00112 o1 = vec_perm (o0, x2, perm_rgb_0); 00113 o2 = vec_perm (o0, x2, perm_rgb_1); 00114 o3 = vec_mergel (x0,x1); 00115 o4 = vec_perm (o3,o2,perm_rgb_2); 00116 o5 = vec_perm (o3,o2,perm_rgb_3); 00117 00118 perm_rgb_0: o0(RG).h v1(B) --> o1* 00119 0 1 2 3 4 00120 rgbr|gbrg|brgb|rgbr 00121 0010 0100 1001 0010 00122 0102 3145 2673 894A 00123 00124 perm_rgb_1: o0(RG).h v1(B) --> o2 00125 0 1 2 3 4 00126 gbrg|brgb|bbbb|bbbb 00127 0100 1001 1111 1111 00128 B5CD 6EF7 89AB CDEF 00129 00130 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* 00131 0 1 2 3 4 00132 gbrg|brgb|rgbr|gbrg 00133 1111 1111 0010 0100 00134 89AB CDEF 0182 3945 00135 00136 perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5* 00137 0 1 2 3 4 00138 brgb|rgbr|gbrg|brgb 00139 1001 0010 0100 1001 00140 a67b 89cA BdCD eEFf 00141 00142 */ 00143 static 00144 const vector unsigned char 00145 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, 00146 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a}, 00147 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, 00148 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}, 00149 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 00150 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05}, 00151 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, 00152 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f}; 00153 00154 #define vec_merge3(x2,x1,x0,y0,y1,y2) \ 00155 do { \ 00156 __typeof__(x0) o0,o2,o3; \ 00157 o0 = vec_mergeh (x0,x1); \ 00158 y0 = vec_perm (o0, x2, perm_rgb_0); \ 00159 o2 = vec_perm (o0, x2, perm_rgb_1); \ 00160 o3 = vec_mergel (x0,x1); \ 00161 y1 = vec_perm (o3,o2,perm_rgb_2); \ 00162 y2 = vec_perm (o3,o2,perm_rgb_3); \ 00163 } while(0) 00164 00165 #define vec_mstbgr24(x0,x1,x2,ptr) \ 00166 do { \ 00167 __typeof__(x0) _0,_1,_2; \ 00168 vec_merge3 (x0,x1,x2,_0,_1,_2); \ 00169 vec_st (_0, 0, ptr++); \ 00170 vec_st (_1, 0, ptr++); \ 00171 vec_st (_2, 0, ptr++); \ 00172 } while (0) 00173 00174 #define vec_mstrgb24(x0,x1,x2,ptr) \ 00175 do { \ 00176 __typeof__(x0) _0,_1,_2; \ 00177 vec_merge3 (x2,x1,x0,_0,_1,_2); \ 00178 vec_st (_0, 0, ptr++); \ 00179 vec_st (_1, 0, ptr++); \ 00180 vec_st (_2, 0, ptr++); \ 00181 } while (0) 00182 00183 /* pack the pixels in rgb0 format 00184 msb R 00185 lsb 0 00186 */ 00187 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ 00188 do { \ 00189 T _0,_1,_2,_3; \ 00190 _0 = vec_mergeh (x0,x1); \ 00191 _1 = vec_mergeh (x2,x3); \ 00192 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ 00193 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ 00194 vec_st (_2, 0*16, (T *)ptr); \ 00195 vec_st (_3, 1*16, (T *)ptr); \ 00196 _0 = vec_mergel (x0,x1); \ 00197 _1 = vec_mergel (x2,x3); \ 00198 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ 00199 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ 00200 vec_st (_2, 2*16, (T *)ptr); \ 00201 vec_st (_3, 3*16, (T *)ptr); \ 00202 ptr += 4; \ 00203 } while (0) 00204 00205 /* 00206 00207 | 1 0 1.4021 | | Y | 00208 | 1 -0.3441 -0.7142 |x| Cb| 00209 | 1 1.7718 0 | | Cr| 00210 00211 00212 Y: [-128 127] 00213 Cb/Cr : [-128 127] 00214 00215 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode. 00216 00217 */ 00218 00219 00220 00221 00222 #define vec_unh(x) \ 00223 (vector signed short) \ 00224 vec_perm(x,(__typeof__(x)){0}, \ 00225 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ 00226 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})) 00227 #define vec_unl(x) \ 00228 (vector signed short) \ 00229 vec_perm(x,(__typeof__(x)){0}, \ 00230 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ 00231 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})) 00232 00233 #define vec_clip_s16(x) \ 00234 vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \ 00235 ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})) 00236 00237 #define vec_packclp(x,y) \ 00238 (vector unsigned char)vec_packs \ 00239 ((vector unsigned short)vec_max (x,((vector signed short) {0})), \ 00240 (vector unsigned short)vec_max (y,((vector signed short) {0}))) 00241 00242 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr) 00243 00244 00245 static inline void cvtyuvtoRGB (SwsContext *c, 00246 vector signed short Y, vector signed short U, vector signed short V, 00247 vector signed short *R, vector signed short *G, vector signed short *B) 00248 { 00249 vector signed short vx,ux,uvx; 00250 00251 Y = vec_mradds (Y, c->CY, c->OY); 00252 U = vec_sub (U,(vector signed short) 00253 vec_splat((vector signed short){128},0)); 00254 V = vec_sub (V,(vector signed short) 00255 vec_splat((vector signed short){128},0)); 00256 00257 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; 00258 ux = vec_sl (U, c->CSHIFT); 00259 *B = vec_mradds (ux, c->CBU, Y); 00260 00261 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; 00262 vx = vec_sl (V, c->CSHIFT); 00263 *R = vec_mradds (vx, c->CRV, Y); 00264 00265 // uvx = ((CGU*u) + (CGV*v))>>15; 00266 uvx = vec_mradds (U, c->CGU, Y); 00267 *G = vec_mradds (V, c->CGV, uvx); 00268 } 00269 00270 00271 /* 00272 ------------------------------------------------------------------------------ 00273 CS converters 00274 ------------------------------------------------------------------------------ 00275 */ 00276 00277 00278 #define DEFCSP420_CVT(name,out_pixels) \ 00279 static int altivec_##name (SwsContext *c, \ 00280 const unsigned char **in, int *instrides, \ 00281 int srcSliceY, int srcSliceH, \ 00282 unsigned char **oplanes, int *outstrides) \ 00283 { \ 00284 int w = c->srcW; \ 00285 int h = srcSliceH; \ 00286 int i,j; \ 00287 int instrides_scl[3]; \ 00288 vector unsigned char y0,y1; \ 00289 \ 00290 vector signed char u,v; \ 00291 \ 00292 vector signed short Y0,Y1,Y2,Y3; \ 00293 vector signed short U,V; \ 00294 vector signed short vx,ux,uvx; \ 00295 vector signed short vx0,ux0,uvx0; \ 00296 vector signed short vx1,ux1,uvx1; \ 00297 vector signed short R0,G0,B0; \ 00298 vector signed short R1,G1,B1; \ 00299 vector unsigned char R,G,B; \ 00300 \ 00301 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \ 00302 vector unsigned char align_perm; \ 00303 \ 00304 vector signed short \ 00305 lCY = c->CY, \ 00306 lOY = c->OY, \ 00307 lCRV = c->CRV, \ 00308 lCBU = c->CBU, \ 00309 lCGU = c->CGU, \ 00310 lCGV = c->CGV; \ 00311 \ 00312 vector unsigned short lCSHIFT = c->CSHIFT; \ 00313 \ 00314 const ubyte *y1i = in[0]; \ 00315 const ubyte *y2i = in[0]+instrides[0]; \ 00316 const ubyte *ui = in[1]; \ 00317 const ubyte *vi = in[2]; \ 00318 \ 00319 vector unsigned char *oute \ 00320 = (vector unsigned char *) \ 00321 (oplanes[0]+srcSliceY*outstrides[0]); \ 00322 vector unsigned char *outo \ 00323 = (vector unsigned char *) \ 00324 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \ 00325 \ 00326 \ 00327 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \ 00328 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \ 00329 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \ 00330 \ 00331 \ 00332 for (i=0;i<h/2;i++) { \ 00333 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \ 00334 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \ 00335 \ 00336 for (j=0;j<w/16;j++) { \ 00337 \ 00338 y1ivP = (vector unsigned char *)y1i; \ 00339 y2ivP = (vector unsigned char *)y2i; \ 00340 uivP = (vector unsigned char *)ui; \ 00341 vivP = (vector unsigned char *)vi; \ 00342 \ 00343 align_perm = vec_lvsl (0, y1i); \ 00344 y0 = (vector unsigned char) \ 00345 vec_perm (y1ivP[0], y1ivP[1], align_perm); \ 00346 \ 00347 align_perm = vec_lvsl (0, y2i); \ 00348 y1 = (vector unsigned char) \ 00349 vec_perm (y2ivP[0], y2ivP[1], align_perm); \ 00350 \ 00351 align_perm = vec_lvsl (0, ui); \ 00352 u = (vector signed char) \ 00353 vec_perm (uivP[0], uivP[1], align_perm); \ 00354 \ 00355 align_perm = vec_lvsl (0, vi); \ 00356 v = (vector signed char) \ 00357 vec_perm (vivP[0], vivP[1], align_perm); \ 00358 \ 00359 u = (vector signed char) \ 00360 vec_sub (u,(vector signed char) \ 00361 vec_splat((vector signed char){128},0)); \ 00362 v = (vector signed char) \ 00363 vec_sub (v,(vector signed char) \ 00364 vec_splat((vector signed char){128},0)); \ 00365 \ 00366 U = vec_unpackh (u); \ 00367 V = vec_unpackh (v); \ 00368 \ 00369 \ 00370 Y0 = vec_unh (y0); \ 00371 Y1 = vec_unl (y0); \ 00372 Y2 = vec_unh (y1); \ 00373 Y3 = vec_unl (y1); \ 00374 \ 00375 Y0 = vec_mradds (Y0, lCY, lOY); \ 00376 Y1 = vec_mradds (Y1, lCY, lOY); \ 00377 Y2 = vec_mradds (Y2, lCY, lOY); \ 00378 Y3 = vec_mradds (Y3, lCY, lOY); \ 00379 \ 00380 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \ 00381 ux = vec_sl (U, lCSHIFT); \ 00382 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \ 00383 ux0 = vec_mergeh (ux,ux); \ 00384 ux1 = vec_mergel (ux,ux); \ 00385 \ 00386 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \ 00387 vx = vec_sl (V, lCSHIFT); \ 00388 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \ 00389 vx0 = vec_mergeh (vx,vx); \ 00390 vx1 = vec_mergel (vx,vx); \ 00391 \ 00392 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \ 00393 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \ 00394 uvx = vec_mradds (V, lCGV, uvx); \ 00395 uvx0 = vec_mergeh (uvx,uvx); \ 00396 uvx1 = vec_mergel (uvx,uvx); \ 00397 \ 00398 R0 = vec_add (Y0,vx0); \ 00399 G0 = vec_add (Y0,uvx0); \ 00400 B0 = vec_add (Y0,ux0); \ 00401 R1 = vec_add (Y1,vx1); \ 00402 G1 = vec_add (Y1,uvx1); \ 00403 B1 = vec_add (Y1,ux1); \ 00404 \ 00405 R = vec_packclp (R0,R1); \ 00406 G = vec_packclp (G0,G1); \ 00407 B = vec_packclp (B0,B1); \ 00408 \ 00409 out_pixels(R,G,B,oute); \ 00410 \ 00411 R0 = vec_add (Y2,vx0); \ 00412 G0 = vec_add (Y2,uvx0); \ 00413 B0 = vec_add (Y2,ux0); \ 00414 R1 = vec_add (Y3,vx1); \ 00415 G1 = vec_add (Y3,uvx1); \ 00416 B1 = vec_add (Y3,ux1); \ 00417 R = vec_packclp (R0,R1); \ 00418 G = vec_packclp (G0,G1); \ 00419 B = vec_packclp (B0,B1); \ 00420 \ 00421 \ 00422 out_pixels(R,G,B,outo); \ 00423 \ 00424 y1i += 16; \ 00425 y2i += 16; \ 00426 ui += 8; \ 00427 vi += 8; \ 00428 \ 00429 } \ 00430 \ 00431 outo += (outstrides[0])>>4; \ 00432 oute += (outstrides[0])>>4; \ 00433 \ 00434 ui += instrides_scl[1]; \ 00435 vi += instrides_scl[2]; \ 00436 y1i += instrides_scl[0]; \ 00437 y2i += instrides_scl[0]; \ 00438 } \ 00439 return srcSliceH; \ 00440 } 00441 00442 00443 #define out_abgr(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr) 00444 #define out_bgra(a,b,c,ptr) vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr) 00445 #define out_rgba(a,b,c,ptr) vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr) 00446 #define out_argb(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr) 00447 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) 00448 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr) 00449 00450 DEFCSP420_CVT (yuv2_abgr, out_abgr) 00451 DEFCSP420_CVT (yuv2_bgra, out_bgra) 00452 DEFCSP420_CVT (yuv2_rgba, out_rgba) 00453 DEFCSP420_CVT (yuv2_argb, out_argb) 00454 DEFCSP420_CVT (yuv2_rgb24, out_rgb24) 00455 DEFCSP420_CVT (yuv2_bgr24, out_bgr24) 00456 00457 00458 // uyvy|uyvy|uyvy|uyvy 00459 // 0123 4567 89ab cdef 00460 static 00461 const vector unsigned char 00462 demux_u = {0x10,0x00,0x10,0x00, 00463 0x10,0x04,0x10,0x04, 00464 0x10,0x08,0x10,0x08, 00465 0x10,0x0c,0x10,0x0c}, 00466 demux_v = {0x10,0x02,0x10,0x02, 00467 0x10,0x06,0x10,0x06, 00468 0x10,0x0A,0x10,0x0A, 00469 0x10,0x0E,0x10,0x0E}, 00470 demux_y = {0x10,0x01,0x10,0x03, 00471 0x10,0x05,0x10,0x07, 00472 0x10,0x09,0x10,0x0B, 00473 0x10,0x0D,0x10,0x0F}; 00474 00475 /* 00476 this is so I can play live CCIR raw video 00477 */ 00478 static int altivec_uyvy_rgb32 (SwsContext *c, 00479 const unsigned char **in, int *instrides, 00480 int srcSliceY, int srcSliceH, 00481 unsigned char **oplanes, int *outstrides) 00482 { 00483 int w = c->srcW; 00484 int h = srcSliceH; 00485 int i,j; 00486 vector unsigned char uyvy; 00487 vector signed short Y,U,V; 00488 vector signed short R0,G0,B0,R1,G1,B1; 00489 vector unsigned char R,G,B; 00490 vector unsigned char *out; 00491 const ubyte *img; 00492 00493 img = in[0]; 00494 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); 00495 00496 for (i=0;i<h;i++) { 00497 for (j=0;j<w/16;j++) { 00498 uyvy = vec_ld (0, img); 00499 U = (vector signed short) 00500 vec_perm (uyvy, (vector unsigned char){0}, demux_u); 00501 00502 V = (vector signed short) 00503 vec_perm (uyvy, (vector unsigned char){0}, demux_v); 00504 00505 Y = (vector signed short) 00506 vec_perm (uyvy, (vector unsigned char){0}, demux_y); 00507 00508 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); 00509 00510 uyvy = vec_ld (16, img); 00511 U = (vector signed short) 00512 vec_perm (uyvy, (vector unsigned char){0}, demux_u); 00513 00514 V = (vector signed short) 00515 vec_perm (uyvy, (vector unsigned char){0}, demux_v); 00516 00517 Y = (vector signed short) 00518 vec_perm (uyvy, (vector unsigned char){0}, demux_y); 00519 00520 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); 00521 00522 R = vec_packclp (R0,R1); 00523 G = vec_packclp (G0,G1); 00524 B = vec_packclp (B0,B1); 00525 00526 // vec_mstbgr24 (R,G,B, out); 00527 out_rgba (R,G,B,out); 00528 00529 img += 32; 00530 } 00531 } 00532 return srcSliceH; 00533 } 00534 00535 00536 00537 /* Ok currently the acceleration routine only supports 00538 inputs of widths a multiple of 16 00539 and heights a multiple 2 00540 00541 So we just fall back to the C codes for this. 00542 */ 00543 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c) 00544 { 00545 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) 00546 return NULL; 00547 00548 /* 00549 and this seems not to matter too much I tried a bunch of 00550 videos with abnormal widths and MPlayer crashes elsewhere. 00551 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 00552 boom with X11 bad match. 00553 00554 */ 00555 if ((c->srcW & 0xf) != 0) return NULL; 00556 00557 switch (c->srcFormat) { 00558 case PIX_FMT_YUV410P: 00559 case PIX_FMT_YUV420P: 00560 /*case IMGFMT_CLPL: ??? */ 00561 case PIX_FMT_GRAY8: 00562 case PIX_FMT_NV12: 00563 case PIX_FMT_NV21: 00564 if ((c->srcH & 0x1) != 0) 00565 return NULL; 00566 00567 switch(c->dstFormat) { 00568 case PIX_FMT_RGB24: 00569 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); 00570 return altivec_yuv2_rgb24; 00571 case PIX_FMT_BGR24: 00572 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); 00573 return altivec_yuv2_bgr24; 00574 case PIX_FMT_ARGB: 00575 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); 00576 return altivec_yuv2_argb; 00577 case PIX_FMT_ABGR: 00578 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); 00579 return altivec_yuv2_abgr; 00580 case PIX_FMT_RGBA: 00581 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); 00582 return altivec_yuv2_rgba; 00583 case PIX_FMT_BGRA: 00584 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); 00585 return altivec_yuv2_bgra; 00586 default: return NULL; 00587 } 00588 break; 00589 00590 case PIX_FMT_UYVY422: 00591 switch(c->dstFormat) { 00592 case PIX_FMT_BGR32: 00593 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); 00594 return altivec_uyvy_rgb32; 00595 default: return NULL; 00596 } 00597 break; 00598 00599 } 00600 return NULL; 00601 } 00602 00603 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation) 00604 { 00605 union { 00606 DECLARE_ALIGNED(16, signed short, tmp)[8]; 00607 vector signed short vec; 00608 } buf; 00609 00610 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy 00611 buf.tmp[1] = -256*brightness; //oy 00612 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv 00613 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu 00614 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu 00615 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv 00616 00617 00618 c->CSHIFT = (vector unsigned short)vec_splat_u16(2); 00619 c->CY = vec_splat ((vector signed short)buf.vec, 0); 00620 c->OY = vec_splat ((vector signed short)buf.vec, 1); 00621 c->CRV = vec_splat ((vector signed short)buf.vec, 2); 00622 c->CBU = vec_splat ((vector signed short)buf.vec, 3); 00623 c->CGU = vec_splat ((vector signed short)buf.vec, 4); 00624 c->CGV = vec_splat ((vector signed short)buf.vec, 5); 00625 return; 00626 } 00627 00628 00629 void 00630 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter, 00631 const int16_t **lumSrc, int lumFilterSize, 00632 const int16_t *chrFilter, const int16_t **chrUSrc, 00633 const int16_t **chrVSrc, int chrFilterSize, 00634 const int16_t **alpSrc, uint8_t *dest, 00635 int dstW, int dstY) 00636 { 00637 int i,j; 00638 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; 00639 vector signed short R0,G0,B0,R1,G1,B1; 00640 00641 vector unsigned char R,G,B; 00642 vector unsigned char *out,*nout; 00643 00644 vector signed short RND = vec_splat_s16(1<<3); 00645 vector unsigned short SCL = vec_splat_u16(4); 00646 DECLARE_ALIGNED(16, unsigned int, scratch)[16]; 00647 00648 vector signed short *YCoeffs, *CCoeffs; 00649 00650 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; 00651 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; 00652 00653 out = (vector unsigned char *)dest; 00654 00655 for (i=0; i<dstW; i+=16) { 00656 Y0 = RND; 00657 Y1 = RND; 00658 /* extract 16 coeffs from lumSrc */ 00659 for (j=0; j<lumFilterSize; j++) { 00660 X0 = vec_ld (0, &lumSrc[j][i]); 00661 X1 = vec_ld (16, &lumSrc[j][i]); 00662 Y0 = vec_mradds (X0, YCoeffs[j], Y0); 00663 Y1 = vec_mradds (X1, YCoeffs[j], Y1); 00664 } 00665 00666 U = RND; 00667 V = RND; 00668 /* extract 8 coeffs from U,V */ 00669 for (j=0; j<chrFilterSize; j++) { 00670 X = vec_ld (0, &chrUSrc[j][i/2]); 00671 U = vec_mradds (X, CCoeffs[j], U); 00672 X = vec_ld (0, &chrVSrc[j][i/2]); 00673 V = vec_mradds (X, CCoeffs[j], V); 00674 } 00675 00676 /* scale and clip signals */ 00677 Y0 = vec_sra (Y0, SCL); 00678 Y1 = vec_sra (Y1, SCL); 00679 U = vec_sra (U, SCL); 00680 V = vec_sra (V, SCL); 00681 00682 Y0 = vec_clip_s16 (Y0); 00683 Y1 = vec_clip_s16 (Y1); 00684 U = vec_clip_s16 (U); 00685 V = vec_clip_s16 (V); 00686 00687 /* now we have 00688 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00689 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 00690 00691 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00692 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 00693 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 00694 */ 00695 00696 U0 = vec_mergeh (U,U); 00697 V0 = vec_mergeh (V,V); 00698 00699 U1 = vec_mergel (U,U); 00700 V1 = vec_mergel (V,V); 00701 00702 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 00703 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 00704 00705 R = vec_packclp (R0,R1); 00706 G = vec_packclp (G0,G1); 00707 B = vec_packclp (B0,B1); 00708 00709 switch(c->dstFormat) { 00710 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; 00711 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; 00712 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; 00713 case PIX_FMT_ARGB: out_argb (R,G,B,out); break; 00714 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; 00715 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; 00716 default: 00717 { 00718 /* If this is reached, the caller should have called yuv2packedXinC 00719 instead. */ 00720 static int printed_error_message; 00721 if (!printed_error_message) { 00722 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", 00723 sws_format_name(c->dstFormat)); 00724 printed_error_message=1; 00725 } 00726 return; 00727 } 00728 } 00729 } 00730 00731 if (i < dstW) { 00732 i -= 16; 00733 00734 Y0 = RND; 00735 Y1 = RND; 00736 /* extract 16 coeffs from lumSrc */ 00737 for (j=0; j<lumFilterSize; j++) { 00738 X0 = vec_ld (0, &lumSrc[j][i]); 00739 X1 = vec_ld (16, &lumSrc[j][i]); 00740 Y0 = vec_mradds (X0, YCoeffs[j], Y0); 00741 Y1 = vec_mradds (X1, YCoeffs[j], Y1); 00742 } 00743 00744 U = RND; 00745 V = RND; 00746 /* extract 8 coeffs from U,V */ 00747 for (j=0; j<chrFilterSize; j++) { 00748 X = vec_ld (0, &chrUSrc[j][i/2]); 00749 U = vec_mradds (X, CCoeffs[j], U); 00750 X = vec_ld (0, &chrVSrc[j][i/2]); 00751 V = vec_mradds (X, CCoeffs[j], V); 00752 } 00753 00754 /* scale and clip signals */ 00755 Y0 = vec_sra (Y0, SCL); 00756 Y1 = vec_sra (Y1, SCL); 00757 U = vec_sra (U, SCL); 00758 V = vec_sra (V, SCL); 00759 00760 Y0 = vec_clip_s16 (Y0); 00761 Y1 = vec_clip_s16 (Y1); 00762 U = vec_clip_s16 (U); 00763 V = vec_clip_s16 (V); 00764 00765 /* now we have 00766 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00767 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 00768 00769 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00770 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 00771 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 00772 */ 00773 00774 U0 = vec_mergeh (U,U); 00775 V0 = vec_mergeh (V,V); 00776 00777 U1 = vec_mergel (U,U); 00778 V1 = vec_mergel (V,V); 00779 00780 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 00781 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 00782 00783 R = vec_packclp (R0,R1); 00784 G = vec_packclp (G0,G1); 00785 B = vec_packclp (B0,B1); 00786 00787 nout = (vector unsigned char *)scratch; 00788 switch(c->dstFormat) { 00789 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; 00790 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; 00791 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; 00792 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; 00793 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; 00794 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; 00795 default: 00796 /* Unreachable, I think. */ 00797 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", 00798 sws_format_name(c->dstFormat)); 00799 return; 00800 } 00801 00802 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); 00803 } 00804 00805 }