Libav 0.7.1
|
00001 /* 00002 * (c) 2001 Fabrice Bellard 00003 * 2007 Marc Hoffman <marc.hoffman@analog.com> 00004 * 00005 * This file is part of Libav. 00006 * 00007 * Libav is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * Libav is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with Libav; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00028 #include <stdlib.h> 00029 #include <stdio.h> 00030 #include <string.h> 00031 #include <sys/time.h> 00032 #include <unistd.h> 00033 #include <math.h> 00034 00035 #include "libavutil/cpu.h" 00036 #include "libavutil/common.h" 00037 #include "libavutil/lfg.h" 00038 00039 #include "simple_idct.h" 00040 #include "aandcttab.h" 00041 #include "faandct.h" 00042 #include "faanidct.h" 00043 #include "x86/idct_xvid.h" 00044 #include "dctref.h" 00045 00046 #undef printf 00047 00048 void ff_mmx_idct(DCTELEM *data); 00049 void ff_mmxext_idct(DCTELEM *data); 00050 00051 void odivx_idct_c(short *block); 00052 00053 // BFIN 00054 void ff_bfin_idct(DCTELEM *block); 00055 void ff_bfin_fdct(DCTELEM *block); 00056 00057 // ALTIVEC 00058 void fdct_altivec(DCTELEM *block); 00059 //void idct_altivec(DCTELEM *block);?? no routine 00060 00061 // ARM 00062 void ff_j_rev_dct_arm(DCTELEM *data); 00063 void ff_simple_idct_arm(DCTELEM *data); 00064 void ff_simple_idct_armv5te(DCTELEM *data); 00065 void ff_simple_idct_armv6(DCTELEM *data); 00066 void ff_simple_idct_neon(DCTELEM *data); 00067 00068 void ff_simple_idct_axp(DCTELEM *data); 00069 00070 struct algo { 00071 const char *name; 00072 enum { FDCT, IDCT } is_idct; 00073 void (* func) (DCTELEM *block); 00074 void (* ref) (DCTELEM *block); 00075 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format; 00076 int mm_support; 00077 }; 00078 00079 #ifndef FAAN_POSTSCALE 00080 #define FAAN_SCALE SCALE_PERM 00081 #else 00082 #define FAAN_SCALE NO_PERM 00083 #endif 00084 00085 static int cpu_flags; 00086 00087 struct algo algos[] = { 00088 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM}, 00089 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE}, 00090 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM}, 00091 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM}, 00092 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM}, 00093 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM}, 00094 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM}, 00095 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM}, 00096 00097 #if HAVE_MMX 00098 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX}, 00099 #if HAVE_MMX2 00100 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2}, 00101 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2}, 00102 #endif 00103 00104 #if CONFIG_GPL 00105 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX}, 00106 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2}, 00107 #endif 00108 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX}, 00109 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX}, 00110 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2}, 00111 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2}, 00112 #endif 00113 00114 #if HAVE_ALTIVEC 00115 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC}, 00116 #endif 00117 00118 #if ARCH_BFIN 00119 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM}, 00120 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM}, 00121 #endif 00122 00123 #if ARCH_ARM 00124 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM }, 00125 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM }, 00126 #if HAVE_ARMV5TE 00127 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM }, 00128 #endif 00129 #if HAVE_ARMV6 00130 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM }, 00131 #endif 00132 #if HAVE_NEON 00133 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM }, 00134 #endif 00135 #endif /* ARCH_ARM */ 00136 00137 #if ARCH_ALPHA 00138 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM }, 00139 #endif 00140 00141 { 0 } 00142 }; 00143 00144 #define AANSCALE_BITS 12 00145 00146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; 00147 00148 static int64_t gettime(void) 00149 { 00150 struct timeval tv; 00151 gettimeofday(&tv,NULL); 00152 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; 00153 } 00154 00155 #define NB_ITS 20000 00156 #define NB_ITS_SPEED 50000 00157 00158 static short idct_mmx_perm[64]; 00159 00160 static short idct_simple_mmx_perm[64]={ 00161 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 00162 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 00163 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 00164 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 00165 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 00166 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 00167 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 00168 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 00169 }; 00170 00171 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7}; 00172 00173 static void idct_mmx_init(void) 00174 { 00175 int i; 00176 00177 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ 00178 for (i = 0; i < 64; i++) { 00179 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); 00180 // idct_simple_mmx_perm[i] = simple_block_permute_op(i); 00181 } 00182 } 00183 00184 DECLARE_ALIGNED(16, static DCTELEM, block)[64]; 00185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64]; 00186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64]; 00187 00188 static inline void mmx_emms(void) 00189 { 00190 #if HAVE_MMX 00191 if (cpu_flags & AV_CPU_FLAG_MMX) 00192 __asm__ volatile ("emms\n\t"); 00193 #endif 00194 } 00195 00196 static void dct_error(const char *name, int is_idct, 00197 void (*fdct_func)(DCTELEM *block), 00198 void (*fdct_ref)(DCTELEM *block), int form, int test) 00199 { 00200 int it, i, scale; 00201 int err_inf, v; 00202 int64_t err2, ti, ti1, it1; 00203 int64_t sysErr[64], sysErrMax=0; 00204 int maxout=0; 00205 int blockSumErrMax=0, blockSumErr; 00206 AVLFG prng; 00207 00208 av_lfg_init(&prng, 1); 00209 00210 err_inf = 0; 00211 err2 = 0; 00212 for(i=0; i<64; i++) sysErr[i]=0; 00213 for(it=0;it<NB_ITS;it++) { 00214 for(i=0;i<64;i++) 00215 block1[i] = 0; 00216 switch(test){ 00217 case 0: 00218 for(i=0;i<64;i++) 00219 block1[i] = (av_lfg_get(&prng) % 512) -256; 00220 if (is_idct){ 00221 ff_ref_fdct(block1); 00222 00223 for(i=0;i<64;i++) 00224 block1[i]>>=3; 00225 } 00226 break; 00227 case 1:{ 00228 int num = av_lfg_get(&prng) % 10 + 1; 00229 for(i=0;i<num;i++) 00230 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256; 00231 }break; 00232 case 2: 00233 block1[0] = av_lfg_get(&prng) % 4096 - 2048; 00234 block1[63]= (block1[0]&1)^1; 00235 break; 00236 } 00237 00238 #if 0 // simulate mismatch control 00239 { int sum=0; 00240 for(i=0;i<64;i++) 00241 sum+=block1[i]; 00242 00243 if((sum&1)==0) block1[63]^=1; 00244 } 00245 #endif 00246 00247 for(i=0; i<64; i++) 00248 block_org[i]= block1[i]; 00249 00250 if (form == MMX_PERM) { 00251 for(i=0;i<64;i++) 00252 block[idct_mmx_perm[i]] = block1[i]; 00253 } else if (form == MMX_SIMPLE_PERM) { 00254 for(i=0;i<64;i++) 00255 block[idct_simple_mmx_perm[i]] = block1[i]; 00256 00257 } else if (form == SSE2_PERM) { 00258 for(i=0; i<64; i++) 00259 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i]; 00260 } else if (form == PARTTRANS_PERM) { 00261 for(i=0; i<64; i++) 00262 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i]; 00263 } else { 00264 for(i=0; i<64; i++) 00265 block[i]= block1[i]; 00266 } 00267 #if 0 // simulate mismatch control for tested IDCT but not the ref 00268 { int sum=0; 00269 for(i=0;i<64;i++) 00270 sum+=block[i]; 00271 00272 if((sum&1)==0) block[63]^=1; 00273 } 00274 #endif 00275 00276 fdct_func(block); 00277 mmx_emms(); 00278 00279 if (form == SCALE_PERM) { 00280 for(i=0; i<64; i++) { 00281 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i]; 00282 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS; 00283 } 00284 } 00285 00286 fdct_ref(block1); 00287 00288 blockSumErr=0; 00289 for(i=0;i<64;i++) { 00290 v = abs(block[i] - block1[i]); 00291 if (v > err_inf) 00292 err_inf = v; 00293 err2 += v * v; 00294 sysErr[i] += block[i] - block1[i]; 00295 blockSumErr += v; 00296 if( abs(block[i])>maxout) maxout=abs(block[i]); 00297 } 00298 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr; 00299 #if 0 // print different matrix pairs 00300 if(blockSumErr){ 00301 printf("\n"); 00302 for(i=0; i<64; i++){ 00303 if((i&7)==0) printf("\n"); 00304 printf("%4d ", block_org[i]); 00305 } 00306 for(i=0; i<64; i++){ 00307 if((i&7)==0) printf("\n"); 00308 printf("%4d ", block[i] - block1[i]); 00309 } 00310 } 00311 #endif 00312 } 00313 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i])); 00314 00315 for(i=0; i<64; i++){ 00316 if(i%8==0) printf("\n"); 00317 printf("%7d ", (int)sysErr[i]); 00318 } 00319 printf("\n"); 00320 00321 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n", 00322 is_idct ? "IDCT" : "DCT", 00323 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax); 00324 00325 /* speed test */ 00326 for(i=0;i<64;i++) 00327 block1[i] = 0; 00328 switch(test){ 00329 case 0: 00330 for(i=0;i<64;i++) 00331 block1[i] = av_lfg_get(&prng) % 512 -256; 00332 if (is_idct){ 00333 ff_ref_fdct(block1); 00334 00335 for(i=0;i<64;i++) 00336 block1[i]>>=3; 00337 } 00338 break; 00339 case 1:{ 00340 case 2: 00341 block1[0] = av_lfg_get(&prng) % 512 -256; 00342 block1[1] = av_lfg_get(&prng) % 512 -256; 00343 block1[2] = av_lfg_get(&prng) % 512 -256; 00344 block1[3] = av_lfg_get(&prng) % 512 -256; 00345 }break; 00346 } 00347 00348 if (form == MMX_PERM) { 00349 for(i=0;i<64;i++) 00350 block[idct_mmx_perm[i]] = block1[i]; 00351 } else if(form == MMX_SIMPLE_PERM) { 00352 for(i=0;i<64;i++) 00353 block[idct_simple_mmx_perm[i]] = block1[i]; 00354 } else { 00355 for(i=0; i<64; i++) 00356 block[i]= block1[i]; 00357 } 00358 00359 ti = gettime(); 00360 it1 = 0; 00361 do { 00362 for(it=0;it<NB_ITS_SPEED;it++) { 00363 for(i=0; i<64; i++) 00364 block[i]= block1[i]; 00365 // memcpy(block, block1, sizeof(DCTELEM) * 64); 00366 // do not memcpy especially not fastmemcpy because it does movntq !!! 00367 fdct_func(block); 00368 } 00369 it1 += NB_ITS_SPEED; 00370 ti1 = gettime() - ti; 00371 } while (ti1 < 1000000); 00372 mmx_emms(); 00373 00374 printf("%s %s: %0.1f kdct/s\n", 00375 is_idct ? "IDCT" : "DCT", 00376 name, (double)it1 * 1000.0 / (double)ti1); 00377 } 00378 00379 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64]; 00380 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64]; 00381 00382 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block) 00383 { 00384 static int init; 00385 static double c8[8][8]; 00386 static double c4[4][4]; 00387 double block1[64], block2[64], block3[64]; 00388 double s, sum, v; 00389 int i, j, k; 00390 00391 if (!init) { 00392 init = 1; 00393 00394 for(i=0;i<8;i++) { 00395 sum = 0; 00396 for(j=0;j<8;j++) { 00397 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0); 00398 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0); 00399 sum += c8[i][j] * c8[i][j]; 00400 } 00401 } 00402 00403 for(i=0;i<4;i++) { 00404 sum = 0; 00405 for(j=0;j<4;j++) { 00406 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0); 00407 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0); 00408 sum += c4[i][j] * c4[i][j]; 00409 } 00410 } 00411 } 00412 00413 /* butterfly */ 00414 s = 0.5 * sqrt(2.0); 00415 for(i=0;i<4;i++) { 00416 for(j=0;j<8;j++) { 00417 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s; 00418 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s; 00419 } 00420 } 00421 00422 /* idct8 on lines */ 00423 for(i=0;i<8;i++) { 00424 for(j=0;j<8;j++) { 00425 sum = 0; 00426 for(k=0;k<8;k++) 00427 sum += c8[k][j] * block1[8*i+k]; 00428 block2[8*i+j] = sum; 00429 } 00430 } 00431 00432 /* idct4 */ 00433 for(i=0;i<8;i++) { 00434 for(j=0;j<4;j++) { 00435 /* top */ 00436 sum = 0; 00437 for(k=0;k<4;k++) 00438 sum += c4[k][j] * block2[8*(2*k)+i]; 00439 block3[8*(2*j)+i] = sum; 00440 00441 /* bottom */ 00442 sum = 0; 00443 for(k=0;k<4;k++) 00444 sum += c4[k][j] * block2[8*(2*k+1)+i]; 00445 block3[8*(2*j+1)+i] = sum; 00446 } 00447 } 00448 00449 /* clamp and store the result */ 00450 for(i=0;i<8;i++) { 00451 for(j=0;j<8;j++) { 00452 v = block3[8*i+j]; 00453 if (v < 0) 00454 v = 0; 00455 else if (v > 255) 00456 v = 255; 00457 dest[i * linesize + j] = (int)rint(v); 00458 } 00459 } 00460 } 00461 00462 static void idct248_error(const char *name, 00463 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block)) 00464 { 00465 int it, i, it1, ti, ti1, err_max, v; 00466 00467 AVLFG prng; 00468 00469 av_lfg_init(&prng, 1); 00470 00471 /* just one test to see if code is correct (precision is less 00472 important here) */ 00473 err_max = 0; 00474 for(it=0;it<NB_ITS;it++) { 00475 00476 /* XXX: use forward transform to generate values */ 00477 for(i=0;i<64;i++) 00478 block1[i] = av_lfg_get(&prng) % 256 - 128; 00479 block1[0] += 1024; 00480 00481 for(i=0; i<64; i++) 00482 block[i]= block1[i]; 00483 idct248_ref(img_dest1, 8, block); 00484 00485 for(i=0; i<64; i++) 00486 block[i]= block1[i]; 00487 idct248_put(img_dest, 8, block); 00488 00489 for(i=0;i<64;i++) { 00490 v = abs((int)img_dest[i] - (int)img_dest1[i]); 00491 if (v == 255) 00492 printf("%d %d\n", img_dest[i], img_dest1[i]); 00493 if (v > err_max) 00494 err_max = v; 00495 } 00496 } 00497 printf("%s %s: err_inf=%d\n", 00498 1 ? "IDCT248" : "DCT248", 00499 name, err_max); 00500 00501 ti = gettime(); 00502 it1 = 0; 00503 do { 00504 for(it=0;it<NB_ITS_SPEED;it++) { 00505 for(i=0; i<64; i++) 00506 block[i]= block1[i]; 00507 // memcpy(block, block1, sizeof(DCTELEM) * 64); 00508 // do not memcpy especially not fastmemcpy because it does movntq !!! 00509 idct248_put(img_dest, 8, block); 00510 } 00511 it1 += NB_ITS_SPEED; 00512 ti1 = gettime() - ti; 00513 } while (ti1 < 1000000); 00514 mmx_emms(); 00515 00516 printf("%s %s: %0.1f kdct/s\n", 00517 1 ? "IDCT248" : "DCT248", 00518 name, (double)it1 * 1000.0 / (double)ti1); 00519 } 00520 00521 static void help(void) 00522 { 00523 printf("dct-test [-i] [<test-number>]\n" 00524 "test-number 0 -> test with random matrixes\n" 00525 " 1 -> test with random sparse matrixes\n" 00526 " 2 -> do 3. test from mpeg4 std\n" 00527 "-i test IDCT implementations\n" 00528 "-4 test IDCT248 implementations\n"); 00529 } 00530 00531 int main(int argc, char **argv) 00532 { 00533 int test_idct = 0, test_248_dct = 0; 00534 int c,i; 00535 int test=1; 00536 cpu_flags = av_get_cpu_flags(); 00537 00538 ff_ref_dct_init(); 00539 idct_mmx_init(); 00540 00541 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; 00542 for(i=0;i<MAX_NEG_CROP;i++) { 00543 cropTbl[i] = 0; 00544 cropTbl[i + MAX_NEG_CROP + 256] = 255; 00545 } 00546 00547 for(;;) { 00548 c = getopt(argc, argv, "ih4"); 00549 if (c == -1) 00550 break; 00551 switch(c) { 00552 case 'i': 00553 test_idct = 1; 00554 break; 00555 case '4': 00556 test_248_dct = 1; 00557 break; 00558 default : 00559 case 'h': 00560 help(); 00561 return 0; 00562 } 00563 } 00564 00565 if(optind <argc) test= atoi(argv[optind]); 00566 00567 printf("ffmpeg DCT/IDCT test\n"); 00568 00569 if (test_248_dct) { 00570 idct248_error("SIMPLE-C", ff_simple_idct248_put); 00571 } else { 00572 for (i=0;algos[i].name;i++) 00573 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) { 00574 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test); 00575 } 00576 } 00577 return 0; 00578 }