Libav
|
/*
 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
 *
 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Implementation status of each filter per back end.
 * NOTE(review): the column alignment of this table is not preserved in this
 * copy; rows with fewer than five entries cannot be mapped to a back end from
 * this text alone -- confirm against the upstream file before relying on it.

C MMX MMX2 3DNow AltiVec
isVertDC Ec Ec Ec
isVertMinMaxOk Ec Ec Ec
doVertLowPass E e e Ec
doVertDefFilter Ec Ec e e Ec
isHorizDC Ec Ec Ec
isHorizMinMaxOk a E Ec
doHorizLowPass E e e Ec
doHorizDefFilter Ec Ec e e Ec
do_a_deblock Ec E Ec E
deRing E e e* Ecp
Vertical RKAlgo1 E a a
Horizontal RKAlgo1 a a
Vertical X1# a E E
Horizontal X1# a E E
LinIpolDeinterlace e E E*
CubicIpolDeinterlace a e e*
LinBlendDeinterlace e E E*
MedianDeinterlace# E Ec Ec
TempDeNoiser# E e e Ec

* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
# more or less self-invented filters so the exactness is not too meaningful
E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
a = alternative / approximate impl
c = checked against the other implementations (-vo md5)
p = partially optimized, still some work to do
*/

/*
TODO:
reduce the time wasted on the mem transfer
unroll stuff if instructions depend too much on the prior one
move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once
        (the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
optimize c versions
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
...
*/

//Changelog: use the Subversion log

#include "config.h"
#include "libavutil/avutil.h"
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Debugging switches: uncomment to force a particular code path. */
//#undef HAVE_MMX2
//#define HAVE_AMD3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86
//#define DEBUG_BRIGHTNESS
#include "postprocess.h"
#include "postprocess_internal.h"

/* Return the library version constant LIBPOSTPROC_VERSION_INT. */
unsigned postproc_version(void)
{
    return LIBPOSTPROC_VERSION_INT;
}

/* Return the build configuration string FFMPEG_CONFIGURATION. */
const char *postproc_configuration(void)
{
    return FFMPEG_CONFIGURATION;
}

/*
 * Return the license string.
 * The literal is the concatenation LICENSE_PREFIX FFMPEG_LICENSE; the
 * returned pointer is advanced past the prefix, so callers see only the
 * FFMPEG_LICENSE part while the prefixed text stays in the object file.
 */
const char *postproc_license(void)
{
#define LICENSE_PREFIX "libpostproc license: "
    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
}

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#define GET_MODE_BUFFER_SIZE 500 /* size of the scratch copy of the mode string */
#define OPTIONS_ARRAY_SIZE 10    /* capacity of the per-filter option pointer array */
#define BLOCK_SIZE 8
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

#if ARCH_X86
/* 64-bit constants for the x86 inline-assembly code paths. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL; 00120 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL; 00121 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL; 00122 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL; 00123 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL; 00124 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL; 00125 #endif 00126 00127 DECLARE_ASM_CONST(8, int, deringThreshold)= 20; 00128 00129 00130 static struct PPFilter filters[]= 00131 { 00132 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, 00133 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, 00134 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, 00135 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ 00136 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, 00137 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, 00138 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK}, 00139 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK}, 00140 {"dr", "dering", 1, 5, 6, DERING}, 00141 {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, 00142 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, 00143 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, 00144 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, 00145 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, 00146 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, 00147 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER}, 00148 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, 00149 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, 00150 {NULL, NULL,0,0,0,0} //End Marker 00151 }; 00152 00153 static const char *replaceTable[]= 00154 { 00155 "default", "hb:a,vb:a,dr:a", 00156 "de", "hb:a,vb:a,dr:a", 00157 "fast", "h1:a,v1:a,dr:a", 00158 "fa", "h1:a,v1:a,dr:a", 00159 "ac", "ha:a:128:7,va:a,dr:a", 00160 NULL //End Marker 00161 }; 00162 00163 00164 #if ARCH_X86 00165 static inline void prefetchnta(void *p) 00166 { 00167 __asm__ volatile( "prefetchnta (%0)\n\t" 00168 : : "r" (p) 00169 ); 00170 } 00171 00172 static inline void prefetcht0(void *p) 
00173 { 00174 __asm__ volatile( "prefetcht0 (%0)\n\t" 00175 : : "r" (p) 00176 ); 00177 } 00178 00179 static inline void prefetcht1(void *p) 00180 { 00181 __asm__ volatile( "prefetcht1 (%0)\n\t" 00182 : : "r" (p) 00183 ); 00184 } 00185 00186 static inline void prefetcht2(void *p) 00187 { 00188 __asm__ volatile( "prefetcht2 (%0)\n\t" 00189 : : "r" (p) 00190 ); 00191 } 00192 #endif 00193 00194 /* The horizontal functions exist only in C because the MMX 00195 * code is faster with vertical filters and transposing. */ 00196 00200 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c) 00201 { 00202 int numEq= 0; 00203 int y; 00204 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00205 const int dcThreshold= dcOffset*2 + 1; 00206 00207 for(y=0; y<BLOCK_SIZE; y++){ 00208 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; 00209 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; 00210 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; 00211 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; 00212 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; 00213 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; 00214 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; 00215 src+= stride; 00216 } 00217 return numEq > c->ppMode.flatnessThreshold; 00218 } 00219 00223 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c) 00224 { 00225 int numEq= 0; 00226 int y; 00227 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00228 const int dcThreshold= dcOffset*2 + 1; 00229 00230 src+= stride*4; // src points to begin of the 8x8 Block 00231 for(y=0; y<BLOCK_SIZE-1; y++){ 00232 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; 00233 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; 00234 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; 00235 
if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; 00236 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; 00237 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; 00238 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; 00239 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; 00240 src+= stride; 00241 } 00242 return numEq > c->ppMode.flatnessThreshold; 00243 } 00244 00245 static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP) 00246 { 00247 int i; 00248 #if 1 00249 for(i=0; i<2; i++){ 00250 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; 00251 src += stride; 00252 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; 00253 src += stride; 00254 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; 00255 src += stride; 00256 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; 00257 src += stride; 00258 } 00259 #else 00260 for(i=0; i<8; i++){ 00261 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0; 00262 src += stride; 00263 } 00264 #endif 00265 return 1; 00266 } 00267 00268 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP) 00269 { 00270 #if 1 00271 #if 1 00272 int x; 00273 src+= stride*4; 00274 for(x=0; x<BLOCK_SIZE; x+=4){ 00275 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; 00276 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; 00277 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; 00278 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; 00279 } 00280 #else 00281 int x; 00282 src+= stride*3; 00283 for(x=0; x<BLOCK_SIZE; x++){ 00284 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; 00285 } 00286 #endif 00287 return 1; 00288 #else 00289 int x; 00290 src+= stride*4; 00291 for(x=0; x<BLOCK_SIZE; x++){ 00292 int min=255; 00293 int max=0; 
00294 int y; 00295 for(y=0; y<8; y++){ 00296 int v= src[x + y*stride]; 00297 if(v>max) max=v; 00298 if(v<min) min=v; 00299 } 00300 if(max-min > 2*QP) return 0; 00301 } 00302 return 1; 00303 #endif 00304 } 00305 00306 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c) 00307 { 00308 if( isHorizDC_C(src, stride, c) ){ 00309 if( isHorizMinMaxOk_C(src, stride, c->QP) ) 00310 return 1; 00311 else 00312 return 0; 00313 }else{ 00314 return 2; 00315 } 00316 } 00317 00318 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c) 00319 { 00320 if( isVertDC_C(src, stride, c) ){ 00321 if( isVertMinMaxOk_C(src, stride, c->QP) ) 00322 return 1; 00323 else 00324 return 0; 00325 }else{ 00326 return 2; 00327 } 00328 } 00329 00330 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c) 00331 { 00332 int y; 00333 for(y=0; y<BLOCK_SIZE; y++){ 00334 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); 00335 00336 if(FFABS(middleEnergy) < 8*c->QP){ 00337 const int q=(dst[3] - dst[4])/2; 00338 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); 00339 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); 00340 00341 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00342 d= FFMAX(d, 0); 00343 00344 d= (5*d + 32) >> 6; 00345 d*= FFSIGN(-middleEnergy); 00346 00347 if(q>0) 00348 { 00349 d= d<0 ? 0 : d; 00350 d= d>q ? q : d; 00351 } 00352 else 00353 { 00354 d= d>0 ? 0 : d; 00355 d= d<q ? q : d; 00356 } 00357 00358 dst[3]-= d; 00359 dst[4]+= d; 00360 } 00361 dst+= stride; 00362 } 00363 } 00364 00369 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c) 00370 { 00371 int y; 00372 for(y=0; y<BLOCK_SIZE; y++){ 00373 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; 00374 const int last= FFABS(dst[8] - dst[7]) < c->QP ? 
dst[8] : dst[7]; 00375 00376 int sums[10]; 00377 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; 00378 sums[1] = sums[0] - first + dst[3]; 00379 sums[2] = sums[1] - first + dst[4]; 00380 sums[3] = sums[2] - first + dst[5]; 00381 sums[4] = sums[3] - first + dst[6]; 00382 sums[5] = sums[4] - dst[0] + dst[7]; 00383 sums[6] = sums[5] - dst[1] + last; 00384 sums[7] = sums[6] - dst[2] + last; 00385 sums[8] = sums[7] - dst[3] + last; 00386 sums[9] = sums[8] - dst[4] + last; 00387 00388 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; 00389 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; 00390 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; 00391 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; 00392 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; 00393 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; 00394 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; 00395 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; 00396 00397 dst+= stride; 00398 } 00399 } 00400 00409 static inline void horizX1Filter(uint8_t *src, int stride, int QP) 00410 { 00411 int y; 00412 static uint64_t *lut= NULL; 00413 if(lut==NULL) 00414 { 00415 int i; 00416 lut = av_malloc(256*8); 00417 for(i=0; i<256; i++) 00418 { 00419 int v= i < 128 ? 
2*i : 2*(i-256); 00420 /* 00421 //Simulate 112242211 9-Tap filter 00422 uint64_t a= (v/16) & 0xFF; 00423 uint64_t b= (v/8) & 0xFF; 00424 uint64_t c= (v/4) & 0xFF; 00425 uint64_t d= (3*v/8) & 0xFF; 00426 */ 00427 //Simulate piecewise linear interpolation 00428 uint64_t a= (v/16) & 0xFF; 00429 uint64_t b= (v*3/16) & 0xFF; 00430 uint64_t c= (v*5/16) & 0xFF; 00431 uint64_t d= (7*v/16) & 0xFF; 00432 uint64_t A= (0x100 - a)&0xFF; 00433 uint64_t B= (0x100 - b)&0xFF; 00434 uint64_t C= (0x100 - c)&0xFF; 00435 uint64_t D= (0x100 - c)&0xFF; 00436 00437 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | 00438 (D<<24) | (C<<16) | (B<<8) | (A); 00439 //lut[i] = (v<<32) | (v<<24); 00440 } 00441 } 00442 00443 for(y=0; y<BLOCK_SIZE; y++){ 00444 int a= src[1] - src[2]; 00445 int b= src[3] - src[4]; 00446 int c= src[5] - src[6]; 00447 00448 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); 00449 00450 if(d < QP){ 00451 int v = d * FFSIGN(-b); 00452 00453 src[1] +=v/8; 00454 src[2] +=v/4; 00455 src[3] +=3*v/8; 00456 src[4] -=3*v/8; 00457 src[5] -=v/4; 00458 src[6] -=v/8; 00459 } 00460 src+=stride; 00461 } 00462 } 00463 00467 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){ 00468 int y; 00469 const int QP= c->QP; 00470 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00471 const int dcThreshold= dcOffset*2 + 1; 00472 //START_TIMER 00473 src+= step*4; // src points to begin of the 8x8 Block 00474 for(y=0; y<8; y++){ 00475 int numEq= 0; 00476 00477 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++; 00478 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++; 00479 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++; 00480 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++; 00481 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++; 00482 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < 
dcThreshold) numEq++; 00483 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++; 00484 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++; 00485 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++; 00486 if(numEq > c->ppMode.flatnessThreshold){ 00487 int min, max, x; 00488 00489 if(src[0] > src[step]){ 00490 max= src[0]; 00491 min= src[step]; 00492 }else{ 00493 max= src[step]; 00494 min= src[0]; 00495 } 00496 for(x=2; x<8; x+=2){ 00497 if(src[x*step] > src[(x+1)*step]){ 00498 if(src[x *step] > max) max= src[ x *step]; 00499 if(src[(x+1)*step] < min) min= src[(x+1)*step]; 00500 }else{ 00501 if(src[(x+1)*step] > max) max= src[(x+1)*step]; 00502 if(src[ x *step] < min) min= src[ x *step]; 00503 } 00504 } 00505 if(max-min < 2*QP){ 00506 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; 00507 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; 00508 00509 int sums[10]; 00510 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; 00511 sums[1] = sums[0] - first + src[3*step]; 00512 sums[2] = sums[1] - first + src[4*step]; 00513 sums[3] = sums[2] - first + src[5*step]; 00514 sums[4] = sums[3] - first + src[6*step]; 00515 sums[5] = sums[4] - src[0*step] + src[7*step]; 00516 sums[6] = sums[5] - src[1*step] + last; 00517 sums[7] = sums[6] - src[2*step] + last; 00518 sums[8] = sums[7] - src[3*step] + last; 00519 sums[9] = sums[8] - src[4*step] + last; 00520 00521 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; 00522 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; 00523 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; 00524 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; 00525 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; 00526 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; 00527 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; 00528 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; 00529 } 
00530 }else{ 00531 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]); 00532 00533 if(FFABS(middleEnergy) < 8*QP){ 00534 const int q=(src[3*step] - src[4*step])/2; 00535 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]); 00536 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); 00537 00538 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00539 d= FFMAX(d, 0); 00540 00541 d= (5*d + 32) >> 6; 00542 d*= FFSIGN(-middleEnergy); 00543 00544 if(q>0){ 00545 d= d<0 ? 0 : d; 00546 d= d>q ? q : d; 00547 }else{ 00548 d= d>0 ? 0 : d; 00549 d= d<q ? q : d; 00550 } 00551 00552 src[3*step]-= d; 00553 src[4*step]+= d; 00554 } 00555 } 00556 00557 src += stride; 00558 } 00559 /*if(step==16){ 00560 STOP_TIMER("step16") 00561 }else{ 00562 STOP_TIMER("stepX") 00563 }*/ 00564 } 00565 00566 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one 00567 //Plain C versions 00568 #if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT 00569 #define COMPILE_C 00570 #endif 00571 00572 #if HAVE_ALTIVEC 00573 #define COMPILE_ALTIVEC 00574 #endif //HAVE_ALTIVEC 00575 00576 #if ARCH_X86 00577 00578 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 00579 #define COMPILE_MMX 00580 #endif 00581 00582 #if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT 00583 #define COMPILE_MMX2 00584 #endif 00585 00586 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 00587 #define COMPILE_3DNOW 00588 #endif 00589 #endif /* ARCH_X86 */ 00590 00591 #undef HAVE_MMX 00592 #define HAVE_MMX 0 00593 #undef HAVE_MMX2 00594 #define HAVE_MMX2 0 00595 #undef HAVE_AMD3DNOW 00596 #define HAVE_AMD3DNOW 0 00597 #undef HAVE_ALTIVEC 00598 #define HAVE_ALTIVEC 0 00599 00600 #ifdef COMPILE_C 00601 #define RENAME(a) a ## _C 00602 #include "postprocess_template.c" 00603 #endif 00604 00605 #ifdef COMPILE_ALTIVEC 00606 #undef RENAME 00607 #undef HAVE_ALTIVEC 00608 
#define HAVE_ALTIVEC 1 00609 #define RENAME(a) a ## _altivec 00610 #include "postprocess_altivec_template.c" 00611 #include "postprocess_template.c" 00612 #endif 00613 00614 //MMX versions 00615 #ifdef COMPILE_MMX 00616 #undef RENAME 00617 #undef HAVE_MMX 00618 #define HAVE_MMX 1 00619 #define RENAME(a) a ## _MMX 00620 #include "postprocess_template.c" 00621 #endif 00622 00623 //MMX2 versions 00624 #ifdef COMPILE_MMX2 00625 #undef RENAME 00626 #undef HAVE_MMX 00627 #undef HAVE_MMX2 00628 #define HAVE_MMX 1 00629 #define HAVE_MMX2 1 00630 #define RENAME(a) a ## _MMX2 00631 #include "postprocess_template.c" 00632 #endif 00633 00634 //3DNOW versions 00635 #ifdef COMPILE_3DNOW 00636 #undef RENAME 00637 #undef HAVE_MMX 00638 #undef HAVE_MMX2 00639 #undef HAVE_AMD3DNOW 00640 #define HAVE_MMX 1 00641 #define HAVE_MMX2 0 00642 #define HAVE_AMD3DNOW 1 00643 #define RENAME(a) a ## _3DNow 00644 #include "postprocess_template.c" 00645 #endif 00646 00647 // minor note: the HAVE_xyz is messed up after that line so do not use it. 
00648 00649 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 00650 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc) 00651 { 00652 PPContext *c= (PPContext *)vc; 00653 PPMode *ppMode= (PPMode *)vm; 00654 c->ppMode= *ppMode; //FIXME 00655 00656 // Using ifs here as they are faster than function pointers although the 00657 // difference would not be measurable here but it is much better because 00658 // someone might exchange the CPU whithout restarting MPlayer ;) 00659 #if CONFIG_RUNTIME_CPUDETECT 00660 #if ARCH_X86 00661 // ordered per speed fastest first 00662 if(c->cpuCaps & PP_CPU_CAPS_MMX2) 00663 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00664 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) 00665 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00666 else if(c->cpuCaps & PP_CPU_CAPS_MMX) 00667 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00668 else 00669 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00670 #else 00671 #if HAVE_ALTIVEC 00672 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC) 00673 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00674 else 00675 #endif 00676 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00677 #endif 00678 #else //CONFIG_RUNTIME_CPUDETECT 00679 #if HAVE_MMX2 00680 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00681 #elif HAVE_AMD3DNOW 00682 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00683 #elif HAVE_MMX 00684 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00685 #elif HAVE_ALTIVEC 00686 postProcess_altivec(src, srcStride, dst, dstStride, width, 
height, QPs, QPStride, isColor, c); 00687 #else 00688 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00689 #endif 00690 #endif //!CONFIG_RUNTIME_CPUDETECT 00691 } 00692 00693 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 00694 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 00695 00696 /* -pp Command line Help 00697 */ 00698 #if LIBPOSTPROC_VERSION_INT < (52<<16) 00699 const char *const pp_help= 00700 #else 00701 const char pp_help[] = 00702 #endif 00703 "Available postprocessing filters:\n" 00704 "Filters Options\n" 00705 "short long name short long option Description\n" 00706 "* * a autoq CPU power dependent enabler\n" 00707 " c chrom chrominance filtering enabled\n" 00708 " y nochrom chrominance filtering disabled\n" 00709 " n noluma luma filtering disabled\n" 00710 "hb hdeblock (2 threshold) horizontal deblocking filter\n" 00711 " 1. difference factor: default=32, higher -> more deblocking\n" 00712 " 2. 
flatness threshold: default=39, lower -> more deblocking\n" 00713 " the h & v deblocking filters share these\n" 00714 " so you can't set different thresholds for h / v\n" 00715 "vb vdeblock (2 threshold) vertical deblocking filter\n" 00716 "ha hadeblock (2 threshold) horizontal deblocking filter\n" 00717 "va vadeblock (2 threshold) vertical deblocking filter\n" 00718 "h1 x1hdeblock experimental h deblock filter 1\n" 00719 "v1 x1vdeblock experimental v deblock filter 1\n" 00720 "dr dering deringing filter\n" 00721 "al autolevels automatic brightness / contrast\n" 00722 " f fullyrange stretch luminance to (0..255)\n" 00723 "lb linblenddeint linear blend deinterlacer\n" 00724 "li linipoldeint linear interpolating deinterlace\n" 00725 "ci cubicipoldeint cubic interpolating deinterlacer\n" 00726 "md mediandeint median deinterlacer\n" 00727 "fd ffmpegdeint ffmpeg deinterlacer\n" 00728 "l5 lowpass5 FIR lowpass deinterlacer\n" 00729 "de default hb:a,vb:a,dr:a\n" 00730 "fa fast h1:a,v1:a,dr:a\n" 00731 "ac ha:a:128:7,va:a,dr:a\n" 00732 "tn tmpnoise (3 threshold) temporal noise reducer\n" 00733 " 1. <= 2. <= 3. 
larger -> stronger filtering\n" 00734 "fq forceQuant <quantizer> force quantizer\n" 00735 "Usage:\n" 00736 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" 00737 "long form example:\n" 00738 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" 00739 "short form example:\n" 00740 "vb:a/hb:a/lb de,-vb\n" 00741 "more examples:\n" 00742 "tn:64:128:256\n" 00743 "\n" 00744 ; 00745 00746 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality) 00747 { 00748 char temp[GET_MODE_BUFFER_SIZE]; 00749 char *p= temp; 00750 static const char filterDelimiters[] = ",/"; 00751 static const char optionDelimiters[] = ":"; 00752 struct PPMode *ppMode; 00753 char *filterToken; 00754 00755 ppMode= av_malloc(sizeof(PPMode)); 00756 00757 ppMode->lumMode= 0; 00758 ppMode->chromMode= 0; 00759 ppMode->maxTmpNoise[0]= 700; 00760 ppMode->maxTmpNoise[1]= 1500; 00761 ppMode->maxTmpNoise[2]= 3000; 00762 ppMode->maxAllowedY= 234; 00763 ppMode->minAllowedY= 16; 00764 ppMode->baseDcDiff= 256/8; 00765 ppMode->flatnessThreshold= 56-16-1; 00766 ppMode->maxClippedThreshold= 0.01; 00767 ppMode->error=0; 00768 00769 strncpy(temp, name, GET_MODE_BUFFER_SIZE); 00770 00771 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name); 00772 00773 for(;;){ 00774 char *filterName; 00775 int q= 1000000; //PP_QUALITY_MAX; 00776 int chrom=-1; 00777 int luma=-1; 00778 char *option; 00779 char *options[OPTIONS_ARRAY_SIZE]; 00780 int i; 00781 int filterNameOk=0; 00782 int numOfUnknownOptions=0; 00783 int enable=1; //does the user want us to enabled or disabled the filter 00784 00785 filterToken= strtok(p, filterDelimiters); 00786 if(filterToken == NULL) break; 00787 p+= strlen(filterToken) + 1; // p points to next filterToken 00788 filterName= strtok(filterToken, optionDelimiters); 00789 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName); 00790 00791 if(*filterName == '-'){ 00792 enable=0; 00793 filterName++; 00794 } 00795 00796 for(;;){ //for all options 
00797 option= strtok(NULL, optionDelimiters); 00798 if(option == NULL) break; 00799 00800 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option); 00801 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; 00802 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; 00803 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; 00804 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0; 00805 else{ 00806 options[numOfUnknownOptions] = option; 00807 numOfUnknownOptions++; 00808 } 00809 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; 00810 } 00811 options[numOfUnknownOptions] = NULL; 00812 00813 /* replace stuff from the replace Table */ 00814 for(i=0; replaceTable[2*i]!=NULL; i++){ 00815 if(!strcmp(replaceTable[2*i], filterName)){ 00816 int newlen= strlen(replaceTable[2*i + 1]); 00817 int plen; 00818 int spaceLeft; 00819 00820 if(p==NULL) p= temp, *p=0; //last filter 00821 else p--, *p=','; //not last filter 00822 00823 plen= strlen(p); 00824 spaceLeft= p - temp + plen; 00825 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE){ 00826 ppMode->error++; 00827 break; 00828 } 00829 memmove(p + newlen, p, plen+1); 00830 memcpy(p, replaceTable[2*i + 1], newlen); 00831 filterNameOk=1; 00832 } 00833 } 00834 00835 for(i=0; filters[i].shortName!=NULL; i++){ 00836 if( !strcmp(filters[i].longName, filterName) 00837 || !strcmp(filters[i].shortName, filterName)){ 00838 ppMode->lumMode &= ~filters[i].mask; 00839 ppMode->chromMode &= ~filters[i].mask; 00840 00841 filterNameOk=1; 00842 if(!enable) break; // user wants to disable it 00843 00844 if(q >= filters[i].minLumQuality && luma) 00845 ppMode->lumMode|= filters[i].mask; 00846 if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) 00847 if(q >= filters[i].minChromQuality) 00848 ppMode->chromMode|= filters[i].mask; 00849 00850 if(filters[i].mask == LEVEL_FIX){ 00851 int o; 00852 ppMode->minAllowedY= 16; 00853 ppMode->maxAllowedY= 234; 00854 for(o=0; options[o]!=NULL; o++){ 00855 
if( !strcmp(options[o],"fullyrange") 00856 ||!strcmp(options[o],"f")){ 00857 ppMode->minAllowedY= 0; 00858 ppMode->maxAllowedY= 255; 00859 numOfUnknownOptions--; 00860 } 00861 } 00862 } 00863 else if(filters[i].mask == TEMP_NOISE_FILTER) 00864 { 00865 int o; 00866 int numOfNoises=0; 00867 00868 for(o=0; options[o]!=NULL; o++){ 00869 char *tail; 00870 ppMode->maxTmpNoise[numOfNoises]= 00871 strtol(options[o], &tail, 0); 00872 if(tail!=options[o]){ 00873 numOfNoises++; 00874 numOfUnknownOptions--; 00875 if(numOfNoises >= 3) break; 00876 } 00877 } 00878 } 00879 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK 00880 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){ 00881 int o; 00882 00883 for(o=0; options[o]!=NULL && o<2; o++){ 00884 char *tail; 00885 int val= strtol(options[o], &tail, 0); 00886 if(tail==options[o]) break; 00887 00888 numOfUnknownOptions--; 00889 if(o==0) ppMode->baseDcDiff= val; 00890 else ppMode->flatnessThreshold= val; 00891 } 00892 } 00893 else if(filters[i].mask == FORCE_QUANT){ 00894 int o; 00895 ppMode->forcedQuant= 15; 00896 00897 for(o=0; options[o]!=NULL && o<1; o++){ 00898 char *tail; 00899 int val= strtol(options[o], &tail, 0); 00900 if(tail==options[o]) break; 00901 00902 numOfUnknownOptions--; 00903 ppMode->forcedQuant= val; 00904 } 00905 } 00906 } 00907 } 00908 if(!filterNameOk) ppMode->error++; 00909 ppMode->error += numOfUnknownOptions; 00910 } 00911 00912 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); 00913 if(ppMode->error){ 00914 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); 00915 av_free(ppMode); 00916 return NULL; 00917 } 00918 return ppMode; 00919 } 00920 00921 void pp_free_mode(pp_mode *mode){ 00922 av_free(mode); 00923 } 00924 00925 static void reallocAlign(void **p, int alignment, int size){ 00926 av_free(*p); 00927 *p= av_mallocz(size); 00928 } 00929 00930 static void 
reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){ 00931 int mbWidth = (width+15)>>4; 00932 int mbHeight= (height+15)>>4; 00933 int i; 00934 00935 c->stride= stride; 00936 c->qpStride= qpStride; 00937 00938 reallocAlign((void **)&c->tempDst, 8, stride*24); 00939 reallocAlign((void **)&c->tempSrc, 8, stride*24); 00940 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); 00941 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); 00942 for(i=0; i<256; i++) 00943 c->yHistogram[i]= width*height/64*15/256; 00944 00945 for(i=0; i<3; i++){ 00946 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end. 00947 reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024); 00948 reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size 00949 } 00950 00951 reallocAlign((void **)&c->deintTemp, 8, 2*width+32); 00952 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 00953 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 00954 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); 00955 } 00956 00957 static const char * context_to_name(void * ptr) { 00958 return "postproc"; 00959 } 00960 00961 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL }; 00962 00963 pp_context *pp_get_context(int width, int height, int cpuCaps){ 00964 PPContext *c= av_malloc(sizeof(PPContext)); 00965 int stride= FFALIGN(width, 16); //assumed / will realloc if needed 00966 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed 00967 00968 memset(c, 0, sizeof(PPContext)); 00969 c->av_class = &av_codec_context_class; 00970 c->cpuCaps= cpuCaps; 00971 if(cpuCaps&PP_FORMAT){ 00972 c->hChromaSubSample= cpuCaps&0x3; 00973 c->vChromaSubSample= (cpuCaps>>4)&0x3; 00974 }else{ 00975 c->hChromaSubSample= 1; 00976 c->vChromaSubSample= 1; 00977 } 00978 00979 reallocBuffers(c, width, 
height, stride, qpStride); 00980 00981 c->frameNum=-1; 00982 00983 return c; 00984 } 00985 00986 void pp_free_context(void *vc){ 00987 PPContext *c = (PPContext*)vc; 00988 int i; 00989 00990 for(i=0; i<3; i++) av_free(c->tempBlurred[i]); 00991 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]); 00992 00993 av_free(c->tempBlocks); 00994 av_free(c->yHistogram); 00995 av_free(c->tempDst); 00996 av_free(c->tempSrc); 00997 av_free(c->deintTemp); 00998 av_free(c->stdQPTable); 00999 av_free(c->nonBQPTable); 01000 av_free(c->forcedQPTable); 01001 01002 memset(c, 0, sizeof(PPContext)); 01003 01004 av_free(c); 01005 } 01006 01007 void pp_postprocess(const uint8_t * src[3], const int srcStride[3], 01008 uint8_t * dst[3], const int dstStride[3], 01009 int width, int height, 01010 const QP_STORE_T *QP_store, int QPStride, 01011 pp_mode *vm, void *vc, int pict_type) 01012 { 01013 int mbWidth = (width+15)>>4; 01014 int mbHeight= (height+15)>>4; 01015 PPMode *mode = (PPMode*)vm; 01016 PPContext *c = (PPContext*)vc; 01017 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0])); 01018 int absQPStride = FFABS(QPStride); 01019 01020 // c->stride and c->QPStride are always positive 01021 if(c->stride < minStride || c->qpStride < absQPStride) 01022 reallocBuffers(c, width, height, 01023 FFMAX(minStride, c->stride), 01024 FFMAX(c->qpStride, absQPStride)); 01025 01026 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){ 01027 int i; 01028 QP_store= c->forcedQPTable; 01029 absQPStride = QPStride = 0; 01030 if(mode->lumMode & FORCE_QUANT) 01031 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant; 01032 else 01033 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1; 01034 } 01035 01036 if(pict_type & PP_PICT_TYPE_QP2){ 01037 int i; 01038 const int count= mbHeight * absQPStride; 01039 for(i=0; i<(count>>2); i++){ 01040 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F; 01041 } 01042 for(i<<=2; i<count; i++){ 01043 c->stdQPTable[i] = QP_store[i]>>1; 
01044 } 01045 QP_store= c->stdQPTable; 01046 QPStride= absQPStride; 01047 } 01048 01049 if(0){ 01050 int x,y; 01051 for(y=0; y<mbHeight; y++){ 01052 for(x=0; x<mbWidth; x++){ 01053 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]); 01054 } 01055 av_log(c, AV_LOG_INFO, "\n"); 01056 } 01057 av_log(c, AV_LOG_INFO, "\n"); 01058 } 01059 01060 if((pict_type&7)!=3){ 01061 if (QPStride >= 0){ 01062 int i; 01063 const int count= mbHeight * QPStride; 01064 for(i=0; i<(count>>2); i++){ 01065 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F; 01066 } 01067 for(i<<=2; i<count; i++){ 01068 c->nonBQPTable[i] = QP_store[i] & 0x3F; 01069 } 01070 } else { 01071 int i,j; 01072 for(i=0; i<mbHeight; i++) { 01073 for(j=0; j<absQPStride; j++) { 01074 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F; 01075 } 01076 } 01077 } 01078 } 01079 01080 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n", 01081 mode->lumMode, mode->chromMode); 01082 01083 postProcess(src[0], srcStride[0], dst[0], dstStride[0], 01084 width, height, QP_store, QPStride, 0, mode, c); 01085 01086 width = (width )>>c->hChromaSubSample; 01087 height = (height)>>c->vChromaSubSample; 01088 01089 if(mode->chromMode){ 01090 postProcess(src[1], srcStride[1], dst[1], dstStride[1], 01091 width, height, QP_store, QPStride, 1, mode, c); 01092 postProcess(src[2], srcStride[2], dst[2], dstStride[2], 01093 width, height, QP_store, QPStride, 2, mode, c); 01094 } 01095 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){ 01096 linecpy(dst[1], src[1], height, srcStride[1]); 01097 linecpy(dst[2], src[2], height, srcStride[2]); 01098 }else{ 01099 int y; 01100 for(y=0; y<height; y++){ 01101 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); 01102 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); 01103 } 01104 } 01105 } 01106