Libav 0.7.1
libavcodec/dnxhdenc.c
Go to the documentation of this file.
00001 /*
00002  * VC3/DNxHD encoder
00003  * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
00004  *
00005  * VC-3 encoder funded by the British Broadcasting Corporation
00006  *
00007  * This file is part of Libav.
00008  *
00009  * Libav is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU Lesser General Public
00011  * License as published by the Free Software Foundation; either
00012  * version 2.1 of the License, or (at your option) any later version.
00013  *
00014  * Libav is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017  * Lesser General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU Lesser General Public
00020  * License along with Libav; if not, write to the Free Software
00021  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00022  */
00023 
00024 //#define DEBUG
00025 #define RC_VARIANCE 1 // use variance or ssd for fast rc
00026 
00027 #include "libavutil/opt.h"
00028 #include "avcodec.h"
00029 #include "dsputil.h"
00030 #include "mpegvideo.h"
00031 #include "dnxhdenc.h"
00032 
00033 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
00034 
00035 static const AVOption options[]={
00036     {"nitris_compat", "encode with Avid Nitris compatibility", offsetof(DNXHDEncContext, nitris_compat), FF_OPT_TYPE_INT, {.dbl = 0}, 0, 1, VE},
00037 {NULL}
00038 };
00039 static const AVClass class = { "dnxhd", av_default_item_name, options, LIBAVUTIL_VERSION_INT };
00040 
00041 int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
00042 
00043 #define LAMBDA_FRAC_BITS 10
00044 
00045 static av_always_inline void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
00046 {
00047     int i;
00048     for (i = 0; i < 4; i++) {
00049         block[0] = pixels[0]; block[1] = pixels[1];
00050         block[2] = pixels[2]; block[3] = pixels[3];
00051         block[4] = pixels[4]; block[5] = pixels[5];
00052         block[6] = pixels[6]; block[7] = pixels[7];
00053         pixels += line_size;
00054         block += 8;
00055     }
00056     memcpy(block   , block- 8, sizeof(*block)*8);
00057     memcpy(block+ 8, block-16, sizeof(*block)*8);
00058     memcpy(block+16, block-24, sizeof(*block)*8);
00059     memcpy(block+24, block-32, sizeof(*block)*8);
00060 }
00061 
00062 static int dnxhd_init_vlc(DNXHDEncContext *ctx)
00063 {
00064     int i, j, level, run;
00065     int max_level = 1<<(ctx->cid_table->bit_depth+2);
00066 
00067     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes, max_level*4*sizeof(*ctx->vlc_codes), fail);
00068     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits , max_level*4*sizeof(*ctx->vlc_bits ), fail);
00069     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes, 63*2                               , fail);
00070     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits , 63                                 , fail);
00071 
00072     ctx->vlc_codes += max_level*2;
00073     ctx->vlc_bits  += max_level*2;
00074     for (level = -max_level; level < max_level; level++) {
00075         for (run = 0; run < 2; run++) {
00076             int index = (level<<1)|run;
00077             int sign, offset = 0, alevel = level;
00078 
00079             MASK_ABS(sign, alevel);
00080             if (alevel > 64) {
00081                 offset = (alevel-1)>>6;
00082                 alevel -= offset<<6;
00083             }
00084             for (j = 0; j < 257; j++) {
00085                 if (ctx->cid_table->ac_level[j] == alevel &&
00086                     (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) &&
00087                     (!run    || (ctx->cid_table->ac_run_flag  [j] && run))) {
00088                     assert(!ctx->vlc_codes[index]);
00089                     if (alevel) {
00090                         ctx->vlc_codes[index] = (ctx->cid_table->ac_codes[j]<<1)|(sign&1);
00091                         ctx->vlc_bits [index] = ctx->cid_table->ac_bits[j]+1;
00092                     } else {
00093                         ctx->vlc_codes[index] = ctx->cid_table->ac_codes[j];
00094                         ctx->vlc_bits [index] = ctx->cid_table->ac_bits [j];
00095                     }
00096                     break;
00097                 }
00098             }
00099             assert(!alevel || j < 257);
00100             if (offset) {
00101                 ctx->vlc_codes[index] = (ctx->vlc_codes[index]<<ctx->cid_table->index_bits)|offset;
00102                 ctx->vlc_bits [index]+= ctx->cid_table->index_bits;
00103             }
00104         }
00105     }
00106     for (i = 0; i < 62; i++) {
00107         int run = ctx->cid_table->run[i];
00108         assert(run < 63);
00109         ctx->run_codes[run] = ctx->cid_table->run_codes[i];
00110         ctx->run_bits [run] = ctx->cid_table->run_bits[i];
00111     }
00112     return 0;
00113  fail:
00114     return -1;
00115 }
00116 
00117 static int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
00118 {
00119     // init first elem to 1 to avoid div by 0 in convert_matrix
00120     uint16_t weight_matrix[64] = {1,}; // convert_matrix needs uint16_t*
00121     int qscale, i;
00122 
00123     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int)     , fail);
00124     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int)     , fail);
00125     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
00126     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
00127 
00128     for (i = 1; i < 64; i++) {
00129         int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
00130         weight_matrix[j] = ctx->cid_table->luma_weight[i];
00131     }
00132     ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
00133                       ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
00134     for (i = 1; i < 64; i++) {
00135         int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
00136         weight_matrix[j] = ctx->cid_table->chroma_weight[i];
00137     }
00138     ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
00139                       ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
00140     for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
00141         for (i = 0; i < 64; i++) {
00142             ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
00143             ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
00144             ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
00145         }
00146     }
00147     return 0;
00148  fail:
00149     return -1;
00150 }
00151 
00152 static int dnxhd_init_rc(DNXHDEncContext *ctx)
00153 {
00154     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc, 8160*ctx->m.avctx->qmax*sizeof(RCEntry), fail);
00155     if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
00156         FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp, ctx->m.mb_num*sizeof(RCCMPEntry), fail);
00157 
00158     ctx->frame_bits = (ctx->cid_table->coding_unit_size - 640 - 4 - ctx->min_padding) * 8;
00159     ctx->qscale = 1;
00160     ctx->lambda = 2<<LAMBDA_FRAC_BITS; // qscale 2
00161     return 0;
00162  fail:
00163     return -1;
00164 }
00165 
00166 static int dnxhd_encode_init(AVCodecContext *avctx)
00167 {
00168     DNXHDEncContext *ctx = avctx->priv_data;
00169     int i, index;
00170 
00171     ctx->cid = ff_dnxhd_find_cid(avctx);
00172     if (!ctx->cid || avctx->pix_fmt != PIX_FMT_YUV422P) {
00173         av_log(avctx, AV_LOG_ERROR, "video parameters incompatible with DNxHD\n");
00174         return -1;
00175     }
00176     av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid);
00177 
00178     index = ff_dnxhd_get_cid_table(ctx->cid);
00179     ctx->cid_table = &ff_dnxhd_cid_table[index];
00180 
00181     ctx->m.avctx = avctx;
00182     ctx->m.mb_intra = 1;
00183     ctx->m.h263_aic = 1;
00184 
00185     ctx->get_pixels_8x4_sym = dnxhd_get_pixels_8x4;
00186 
00187     dsputil_init(&ctx->m.dsp, avctx);
00188     ff_dct_common_init(&ctx->m);
00189 #if HAVE_MMX
00190     ff_dnxhd_init_mmx(ctx);
00191 #endif
00192     if (!ctx->m.dct_quantize)
00193         ctx->m.dct_quantize = dct_quantize_c;
00194 
00195     ctx->m.mb_height = (avctx->height + 15) / 16;
00196     ctx->m.mb_width  = (avctx->width  + 15) / 16;
00197 
00198     if (avctx->flags & CODEC_FLAG_INTERLACED_DCT) {
00199         ctx->interlaced = 1;
00200         ctx->m.mb_height /= 2;
00201     }
00202 
00203     ctx->m.mb_num = ctx->m.mb_height * ctx->m.mb_width;
00204 
00205     if (avctx->intra_quant_bias != FF_DEFAULT_QUANT_BIAS)
00206         ctx->m.intra_quant_bias = avctx->intra_quant_bias;
00207     if (dnxhd_init_qmat(ctx, ctx->m.intra_quant_bias, 0) < 0) // XXX tune lbias/cbias
00208         return -1;
00209 
00210     // Avid Nitris hardware decoder requires a minimum amount of padding in the coding unit payload
00211     if (ctx->nitris_compat)
00212         ctx->min_padding = 1600;
00213 
00214     if (dnxhd_init_vlc(ctx) < 0)
00215         return -1;
00216     if (dnxhd_init_rc(ctx) < 0)
00217         return -1;
00218 
00219     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail);
00220     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail);
00221     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits,    ctx->m.mb_num   *sizeof(uint16_t), fail);
00222     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale,  ctx->m.mb_num   *sizeof(uint8_t) , fail);
00223 
00224     ctx->frame.key_frame = 1;
00225     ctx->frame.pict_type = AV_PICTURE_TYPE_I;
00226     ctx->m.avctx->coded_frame = &ctx->frame;
00227 
00228     if (avctx->thread_count > MAX_THREADS) {
00229         av_log(avctx, AV_LOG_ERROR, "too many threads\n");
00230         return -1;
00231     }
00232 
00233     ctx->thread[0] = ctx;
00234     for (i = 1; i < avctx->thread_count; i++) {
00235         ctx->thread[i] =  av_malloc(sizeof(DNXHDEncContext));
00236         memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
00237     }
00238 
00239     return 0;
00240  fail: //for FF_ALLOCZ_OR_GOTO
00241     return -1;
00242 }
00243 
00244 static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
00245 {
00246     DNXHDEncContext *ctx = avctx->priv_data;
00247     const uint8_t header_prefix[5] = { 0x00,0x00,0x02,0x80,0x01 };
00248 
00249     memset(buf, 0, 640);
00250 
00251     memcpy(buf, header_prefix, 5);
00252     buf[5] = ctx->interlaced ? ctx->cur_field+2 : 0x01;
00253     buf[6] = 0x80; // crc flag off
00254     buf[7] = 0xa0; // reserved
00255     AV_WB16(buf + 0x18, avctx->height>>ctx->interlaced); // ALPF
00256     AV_WB16(buf + 0x1a, avctx->width);  // SPL
00257     AV_WB16(buf + 0x1d, avctx->height>>ctx->interlaced); // NAL
00258 
00259     buf[0x21] = 0x38; // FIXME 8 bit per comp
00260     buf[0x22] = 0x88 + (ctx->interlaced<<2);
00261     AV_WB32(buf + 0x28, ctx->cid); // CID
00262     buf[0x2c] = ctx->interlaced ? 0 : 0x80;
00263 
00264     buf[0x5f] = 0x01; // UDL
00265 
00266     buf[0x167] = 0x02; // reserved
00267     AV_WB16(buf + 0x16a, ctx->m.mb_height * 4 + 4); // MSIPS
00268     buf[0x16d] = ctx->m.mb_height; // Ns
00269     buf[0x16f] = 0x10; // reserved
00270 
00271     ctx->msip = buf + 0x170;
00272     return 0;
00273 }
00274 
00275 static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
00276 {
00277     int nbits;
00278     if (diff < 0) {
00279         nbits = av_log2_16bit(-2*diff);
00280         diff--;
00281     } else {
00282         nbits = av_log2_16bit(2*diff);
00283     }
00284     put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
00285              (ctx->cid_table->dc_codes[nbits]<<nbits) + (diff & ((1 << nbits) - 1)));
00286 }
00287 
00288 static av_always_inline void dnxhd_encode_block(DNXHDEncContext *ctx, DCTELEM *block, int last_index, int n)
00289 {
00290     int last_non_zero = 0;
00291     int slevel, i, j;
00292 
00293     dnxhd_encode_dc(ctx, block[0] - ctx->m.last_dc[n]);
00294     ctx->m.last_dc[n] = block[0];
00295 
00296     for (i = 1; i <= last_index; i++) {
00297         j = ctx->m.intra_scantable.permutated[i];
00298         slevel = block[j];
00299         if (slevel) {
00300             int run_level = i - last_non_zero - 1;
00301             int rlevel = (slevel<<1)|!!run_level;
00302             put_bits(&ctx->m.pb, ctx->vlc_bits[rlevel], ctx->vlc_codes[rlevel]);
00303             if (run_level)
00304                 put_bits(&ctx->m.pb, ctx->run_bits[run_level], ctx->run_codes[run_level]);
00305             last_non_zero = i;
00306         }
00307     }
00308     put_bits(&ctx->m.pb, ctx->vlc_bits[0], ctx->vlc_codes[0]); // EOB
00309 }
00310 
00311 static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx, DCTELEM *block, int n, int qscale, int last_index)
00312 {
00313     const uint8_t *weight_matrix;
00314     int level;
00315     int i;
00316 
00317     weight_matrix = (n&2) ? ctx->cid_table->chroma_weight : ctx->cid_table->luma_weight;
00318 
00319     for (i = 1; i <= last_index; i++) {
00320         int j = ctx->m.intra_scantable.permutated[i];
00321         level = block[j];
00322         if (level) {
00323             if (level < 0) {
00324                 level = (1-2*level) * qscale * weight_matrix[i];
00325                 if (weight_matrix[i] != 32)
00326                     level += 32;
00327                 level >>= 6;
00328                 level = -level;
00329             } else {
00330                 level = (2*level+1) * qscale * weight_matrix[i];
00331                 if (weight_matrix[i] != 32)
00332                     level += 32;
00333                 level >>= 6;
00334             }
00335             block[j] = level;
00336         }
00337     }
00338 }
00339 
00340 static av_always_inline int dnxhd_ssd_block(DCTELEM *qblock, DCTELEM *block)
00341 {
00342     int score = 0;
00343     int i;
00344     for (i = 0; i < 64; i++)
00345         score += (block[i]-qblock[i])*(block[i]-qblock[i]);
00346     return score;
00347 }
00348 
00349 static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *block, int last_index)
00350 {
00351     int last_non_zero = 0;
00352     int bits = 0;
00353     int i, j, level;
00354     for (i = 1; i <= last_index; i++) {
00355         j = ctx->m.intra_scantable.permutated[i];
00356         level = block[j];
00357         if (level) {
00358             int run_level = i - last_non_zero - 1;
00359             bits += ctx->vlc_bits[(level<<1)|!!run_level]+ctx->run_bits[run_level];
00360             last_non_zero = i;
00361         }
00362     }
00363     return bits;
00364 }
00365 
00366 static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
00367 {
00368     const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << 4);
00369     const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3);
00370     const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3);
00371     DSPContext *dsp = &ctx->m.dsp;
00372 
00373     dsp->get_pixels(ctx->blocks[0], ptr_y    , ctx->m.linesize);
00374     dsp->get_pixels(ctx->blocks[1], ptr_y + 8, ctx->m.linesize);
00375     dsp->get_pixels(ctx->blocks[2], ptr_u    , ctx->m.uvlinesize);
00376     dsp->get_pixels(ctx->blocks[3], ptr_v    , ctx->m.uvlinesize);
00377 
00378     if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
00379         if (ctx->interlaced) {
00380             ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset    , ctx->m.linesize);
00381             ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
00382             ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset   , ctx->m.uvlinesize);
00383             ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset   , ctx->m.uvlinesize);
00384         } else {
00385             dsp->clear_block(ctx->blocks[4]); dsp->clear_block(ctx->blocks[5]);
00386             dsp->clear_block(ctx->blocks[6]); dsp->clear_block(ctx->blocks[7]);
00387         }
00388     } else {
00389         dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset    , ctx->m.linesize);
00390         dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
00391         dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset   , ctx->m.uvlinesize);
00392         dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset   , ctx->m.uvlinesize);
00393     }
00394 }
00395 
00396 static av_always_inline int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
00397 {
00398     if (i&2) {
00399         ctx->m.q_intra_matrix16 = ctx->qmatrix_c16;
00400         ctx->m.q_intra_matrix   = ctx->qmatrix_c;
00401         return 1 + (i&1);
00402     } else {
00403         ctx->m.q_intra_matrix16 = ctx->qmatrix_l16;
00404         ctx->m.q_intra_matrix   = ctx->qmatrix_l;
00405         return 0;
00406     }
00407 }
00408 
00409 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
00410 {
00411     DNXHDEncContext *ctx = avctx->priv_data;
00412     int mb_y = jobnr, mb_x;
00413     int qscale = ctx->qscale;
00414     LOCAL_ALIGNED_16(DCTELEM, block, [64]);
00415     ctx = ctx->thread[threadnr];
00416 
00417     ctx->m.last_dc[0] =
00418     ctx->m.last_dc[1] =
00419     ctx->m.last_dc[2] = 1024;
00420 
00421     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
00422         unsigned mb = mb_y * ctx->m.mb_width + mb_x;
00423         int ssd     = 0;
00424         int ac_bits = 0;
00425         int dc_bits = 0;
00426         int i;
00427 
00428         dnxhd_get_blocks(ctx, mb_x, mb_y);
00429 
00430         for (i = 0; i < 8; i++) {
00431             DCTELEM *src_block = ctx->blocks[i];
00432             int overflow, nbits, diff, last_index;
00433             int n = dnxhd_switch_matrix(ctx, i);
00434 
00435             memcpy(block, src_block, 64*sizeof(*block));
00436             last_index = ctx->m.dct_quantize(&ctx->m, block, i, qscale, &overflow);
00437             ac_bits += dnxhd_calc_ac_bits(ctx, block, last_index);
00438 
00439             diff = block[0] - ctx->m.last_dc[n];
00440             if (diff < 0) nbits = av_log2_16bit(-2*diff);
00441             else          nbits = av_log2_16bit( 2*diff);
00442             dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;
00443 
00444             ctx->m.last_dc[n] = block[0];
00445 
00446             if (avctx->mb_decision == FF_MB_DECISION_RD || !RC_VARIANCE) {
00447                 dnxhd_unquantize_c(ctx, block, i, qscale, last_index);
00448                 ctx->m.dsp.idct(block);
00449                 ssd += dnxhd_ssd_block(block, src_block);
00450             }
00451         }
00452         ctx->mb_rc[qscale][mb].ssd = ssd;
00453         ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0];
00454     }
00455     return 0;
00456 }
00457 
00458 static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
00459 {
00460     DNXHDEncContext *ctx = avctx->priv_data;
00461     int mb_y = jobnr, mb_x;
00462     ctx = ctx->thread[threadnr];
00463     init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]);
00464 
00465     ctx->m.last_dc[0] =
00466     ctx->m.last_dc[1] =
00467     ctx->m.last_dc[2] = 1024;
00468     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
00469         unsigned mb = mb_y * ctx->m.mb_width + mb_x;
00470         int qscale = ctx->mb_qscale[mb];
00471         int i;
00472 
00473         put_bits(&ctx->m.pb, 12, qscale<<1);
00474 
00475         dnxhd_get_blocks(ctx, mb_x, mb_y);
00476 
00477         for (i = 0; i < 8; i++) {
00478             DCTELEM *block = ctx->blocks[i];
00479             int last_index, overflow;
00480             int n = dnxhd_switch_matrix(ctx, i);
00481             last_index = ctx->m.dct_quantize(&ctx->m, block, i, qscale, &overflow);
00482             //START_TIMER;
00483             dnxhd_encode_block(ctx, block, last_index, n);
00484             //STOP_TIMER("encode_block");
00485         }
00486     }
00487     if (put_bits_count(&ctx->m.pb)&31)
00488         put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0);
00489     flush_put_bits(&ctx->m.pb);
00490     return 0;
00491 }
00492 
00493 static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx)
00494 {
00495     int mb_y, mb_x;
00496     int offset = 0;
00497     for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) {
00498         int thread_size;
00499         ctx->slice_offs[mb_y] = offset;
00500             ctx->slice_size[mb_y] = 0;
00501             for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
00502                 unsigned mb = mb_y * ctx->m.mb_width + mb_x;
00503                 ctx->slice_size[mb_y] += ctx->mb_bits[mb];
00504             }
00505             ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31;
00506             ctx->slice_size[mb_y] >>= 3;
00507             thread_size = ctx->slice_size[mb_y];
00508         offset += thread_size;
00509     }
00510 }
00511 
00512 static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
00513 {
00514     DNXHDEncContext *ctx = avctx->priv_data;
00515     int mb_y = jobnr, mb_x;
00516     ctx = ctx->thread[threadnr];
00517     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
00518         unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
00519         uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4);
00520         int sum      = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
00521         int varc     = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8;
00522         ctx->mb_cmp[mb].value = varc;
00523         ctx->mb_cmp[mb].mb = mb;
00524     }
00525     return 0;
00526 }
00527 
00528 static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
00529 {
00530     int lambda, up_step, down_step;
00531     int last_lower = INT_MAX, last_higher = 0;
00532     int x, y, q;
00533 
00534     for (q = 1; q < avctx->qmax; q++) {
00535         ctx->qscale = q;
00536         avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
00537     }
00538     up_step = down_step = 2<<LAMBDA_FRAC_BITS;
00539     lambda = ctx->lambda;
00540 
00541     for (;;) {
00542         int bits = 0;
00543         int end = 0;
00544         if (lambda == last_higher) {
00545             lambda++;
00546             end = 1; // need to set final qscales/bits
00547         }
00548         for (y = 0; y < ctx->m.mb_height; y++) {
00549             for (x = 0; x < ctx->m.mb_width; x++) {
00550                 unsigned min = UINT_MAX;
00551                 int qscale = 1;
00552                 int mb = y*ctx->m.mb_width+x;
00553                 for (q = 1; q < avctx->qmax; q++) {
00554                     unsigned score = ctx->mb_rc[q][mb].bits*lambda+(ctx->mb_rc[q][mb].ssd<<LAMBDA_FRAC_BITS);
00555                     if (score < min) {
00556                         min = score;
00557                         qscale = q;
00558                     }
00559                 }
00560                 bits += ctx->mb_rc[qscale][mb].bits;
00561                 ctx->mb_qscale[mb] = qscale;
00562                 ctx->mb_bits[mb] = ctx->mb_rc[qscale][mb].bits;
00563             }
00564             bits = (bits+31)&~31; // padding
00565             if (bits > ctx->frame_bits)
00566                 break;
00567         }
00568         //av_dlog(ctx->m.avctx, "lambda %d, up %u, down %u, bits %d, frame %d\n",
00569         //        lambda, last_higher, last_lower, bits, ctx->frame_bits);
00570         if (end) {
00571             if (bits > ctx->frame_bits)
00572                 return -1;
00573             break;
00574         }
00575         if (bits < ctx->frame_bits) {
00576             last_lower = FFMIN(lambda, last_lower);
00577             if (last_higher != 0)
00578                 lambda = (lambda+last_higher)>>1;
00579             else
00580                 lambda -= down_step;
00581             down_step *= 5; // XXX tune ?
00582             up_step = 1<<LAMBDA_FRAC_BITS;
00583             lambda = FFMAX(1, lambda);
00584             if (lambda == last_lower)
00585                 break;
00586         } else {
00587             last_higher = FFMAX(lambda, last_higher);
00588             if (last_lower != INT_MAX)
00589                 lambda = (lambda+last_lower)>>1;
00590             else if ((int64_t)lambda + up_step > INT_MAX)
00591                 return -1;
00592             else
00593                 lambda += up_step;
00594             up_step = FFMIN((int64_t)up_step*5, INT_MAX);
00595             down_step = 1<<LAMBDA_FRAC_BITS;
00596         }
00597     }
00598     //av_dlog(ctx->m.avctx, "out lambda %d\n", lambda);
00599     ctx->lambda = lambda;
00600     return 0;
00601 }
00602 
00603 static int dnxhd_find_qscale(DNXHDEncContext *ctx)
00604 {
00605     int bits = 0;
00606     int up_step = 1;
00607     int down_step = 1;
00608     int last_higher = 0;
00609     int last_lower = INT_MAX;
00610     int qscale;
00611     int x, y;
00612 
00613     qscale = ctx->qscale;
00614     for (;;) {
00615         bits = 0;
00616         ctx->qscale = qscale;
00617         // XXX avoid recalculating bits
00618         ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
00619         for (y = 0; y < ctx->m.mb_height; y++) {
00620             for (x = 0; x < ctx->m.mb_width; x++)
00621                 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
00622             bits = (bits+31)&~31; // padding
00623             if (bits > ctx->frame_bits)
00624                 break;
00625         }
00626         //av_dlog(ctx->m.avctx, "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n",
00627         //        ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits, last_higher, last_lower);
00628         if (bits < ctx->frame_bits) {
00629             if (qscale == 1)
00630                 return 1;
00631             if (last_higher == qscale - 1) {
00632                 qscale = last_higher;
00633                 break;
00634             }
00635             last_lower = FFMIN(qscale, last_lower);
00636             if (last_higher != 0)
00637                 qscale = (qscale+last_higher)>>1;
00638             else
00639                 qscale -= down_step++;
00640             if (qscale < 1)
00641                 qscale = 1;
00642             up_step = 1;
00643         } else {
00644             if (last_lower == qscale + 1)
00645                 break;
00646             last_higher = FFMAX(qscale, last_higher);
00647             if (last_lower != INT_MAX)
00648                 qscale = (qscale+last_lower)>>1;
00649             else
00650                 qscale += up_step++;
00651             down_step = 1;
00652             if (qscale >= ctx->m.avctx->qmax)
00653                 return -1;
00654         }
00655     }
00656     //av_dlog(ctx->m.avctx, "out qscale %d\n", qscale);
00657     ctx->qscale = qscale;
00658     return 0;
00659 }
00660 
/* Radix sort parameters: RADIX_PASSES digits of BUCKET_BITS bits each. */
#define BUCKET_BITS 8
#define RADIX_PASSES 4
#define NBUCKETS (1 << BUCKET_BITS)

/**
 * Map one radix digit of value to a bucket index.
 *
 * The digit is the BUCKET_BITS-wide field starting at bit position shift;
 * the index is reversed so that larger digits land in lower buckets.
 */
static inline int get_bucket(int value, int shift)
{
    const int digit = (value >> shift) & (NBUCKETS - 1);

    return (NBUCKETS - 1) - digit;
}
00671 
00672 static void radix_count(const RCCMPEntry *data, int size, int buckets[RADIX_PASSES][NBUCKETS])
00673 {
00674     int i, j;
00675     memset(buckets, 0, sizeof(buckets[0][0]) * RADIX_PASSES * NBUCKETS);
00676     for (i = 0; i < size; i++) {
00677         int v = data[i].value;
00678         for (j = 0; j < RADIX_PASSES; j++) {
00679             buckets[j][get_bucket(v, 0)]++;
00680             v >>= BUCKET_BITS;
00681         }
00682         assert(!v);
00683     }
00684     for (j = 0; j < RADIX_PASSES; j++) {
00685         int offset = size;
00686         for (i = NBUCKETS - 1; i >= 0; i--)
00687             buckets[j][i] = offset -= buckets[j][i];
00688         assert(!buckets[j][0]);
00689     }
00690 }
00691 
00692 static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data, int size, int buckets[NBUCKETS], int pass)
00693 {
00694     int shift = pass * BUCKET_BITS;
00695     int i;
00696     for (i = 0; i < size; i++) {
00697         int v = get_bucket(data[i].value, shift);
00698         int pos = buckets[v]++;
00699         dst[pos] = data[i];
00700     }
00701 }
00702 
00703 static void radix_sort(RCCMPEntry *data, int size)
00704 {
00705     int buckets[RADIX_PASSES][NBUCKETS];
00706     RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size);
00707     radix_count(data, size, buckets);
00708     radix_sort_pass(tmp, data, size, buckets[0], 0);
00709     radix_sort_pass(data, tmp, size, buckets[1], 1);
00710     if (buckets[2][NBUCKETS - 1] || buckets[3][NBUCKETS - 1]) {
00711         radix_sort_pass(tmp, data, size, buckets[2], 2);
00712         radix_sort_pass(data, tmp, size, buckets[3], 3);
00713     }
00714     av_free(tmp);
00715 }
00716 
00717 static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
00718 {
00719     int max_bits = 0;
00720     int ret, x, y;
00721     if ((ret = dnxhd_find_qscale(ctx)) < 0)
00722         return -1;
00723     for (y = 0; y < ctx->m.mb_height; y++) {
00724         for (x = 0; x < ctx->m.mb_width; x++) {
00725             int mb = y*ctx->m.mb_width+x;
00726             int delta_bits;
00727             ctx->mb_qscale[mb] = ctx->qscale;
00728             ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale][mb].bits;
00729             max_bits += ctx->mb_rc[ctx->qscale][mb].bits;
00730             if (!RC_VARIANCE) {
00731                 delta_bits = ctx->mb_rc[ctx->qscale][mb].bits-ctx->mb_rc[ctx->qscale+1][mb].bits;
00732                 ctx->mb_cmp[mb].mb = mb;
00733                 ctx->mb_cmp[mb].value = delta_bits ?
00734                     ((ctx->mb_rc[ctx->qscale][mb].ssd-ctx->mb_rc[ctx->qscale+1][mb].ssd)*100)/delta_bits
00735                     : INT_MIN; //avoid increasing qscale
00736             }
00737         }
00738         max_bits += 31; //worst padding
00739     }
00740     if (!ret) {
00741         if (RC_VARIANCE)
00742             avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height);
00743         radix_sort(ctx->mb_cmp, ctx->m.mb_num);
00744         for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
00745             int mb = ctx->mb_cmp[x].mb;
00746             max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits;
00747             ctx->mb_qscale[mb] = ctx->qscale+1;
00748             ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale+1][mb].bits;
00749         }
00750     }
00751     return 0;
00752 }
00753 
00754 static void dnxhd_load_picture(DNXHDEncContext *ctx, const AVFrame *frame)
00755 {
00756     int i;
00757 
00758     for (i = 0; i < 3; i++) {
00759         ctx->frame.data[i]     = frame->data[i];
00760         ctx->frame.linesize[i] = frame->linesize[i];
00761     }
00762 
00763     for (i = 0; i < ctx->m.avctx->thread_count; i++) {
00764         ctx->thread[i]->m.linesize    = ctx->frame.linesize[0]<<ctx->interlaced;
00765         ctx->thread[i]->m.uvlinesize  = ctx->frame.linesize[1]<<ctx->interlaced;
00766         ctx->thread[i]->dct_y_offset  = ctx->m.linesize  *8;
00767         ctx->thread[i]->dct_uv_offset = ctx->m.uvlinesize*8;
00768     }
00769 
00770     ctx->frame.interlaced_frame = frame->interlaced_frame;
00771     ctx->cur_field = frame->interlaced_frame && !frame->top_field_first;
00772 }
00773 
00774 static int dnxhd_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data)
00775 {
00776     DNXHDEncContext *ctx = avctx->priv_data;
00777     int first_field = 1;
00778     int offset, i, ret;
00779 
00780     if (buf_size < ctx->cid_table->frame_size) {
00781         av_log(avctx, AV_LOG_ERROR, "output buffer is too small to compress picture\n");
00782         return -1;
00783     }
00784 
00785     dnxhd_load_picture(ctx, data);
00786 
00787  encode_coding_unit:
00788     for (i = 0; i < 3; i++) {
00789         ctx->src[i] = ctx->frame.data[i];
00790         if (ctx->interlaced && ctx->cur_field)
00791             ctx->src[i] += ctx->frame.linesize[i];
00792     }
00793 
00794     dnxhd_write_header(avctx, buf);
00795 
00796     if (avctx->mb_decision == FF_MB_DECISION_RD)
00797         ret = dnxhd_encode_rdo(avctx, ctx);
00798     else
00799         ret = dnxhd_encode_fast(avctx, ctx);
00800     if (ret < 0) {
00801         av_log(avctx, AV_LOG_ERROR,
00802                "picture could not fit ratecontrol constraints, increase qmax\n");
00803         return -1;
00804     }
00805 
00806     dnxhd_setup_threads_slices(ctx);
00807 
00808     offset = 0;
00809     for (i = 0; i < ctx->m.mb_height; i++) {
00810         AV_WB32(ctx->msip + i * 4, offset);
00811         offset += ctx->slice_size[i];
00812         assert(!(ctx->slice_size[i] & 3));
00813     }
00814 
00815     avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
00816 
00817     assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
00818     memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640);
00819 
00820     AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF
00821 
00822     if (ctx->interlaced && first_field) {
00823         first_field     = 0;
00824         ctx->cur_field ^= 1;
00825         buf      += ctx->cid_table->coding_unit_size;
00826         buf_size -= ctx->cid_table->coding_unit_size;
00827         goto encode_coding_unit;
00828     }
00829 
00830     ctx->frame.quality = ctx->qscale*FF_QP2LAMBDA;
00831 
00832     return ctx->cid_table->frame_size;
00833 }
00834 
00835 static int dnxhd_encode_end(AVCodecContext *avctx)
00836 {
00837     DNXHDEncContext *ctx = avctx->priv_data;
00838     int max_level = 1<<(ctx->cid_table->bit_depth+2);
00839     int i;
00840 
00841     av_free(ctx->vlc_codes-max_level*2);
00842     av_free(ctx->vlc_bits -max_level*2);
00843     av_freep(&ctx->run_codes);
00844     av_freep(&ctx->run_bits);
00845 
00846     av_freep(&ctx->mb_bits);
00847     av_freep(&ctx->mb_qscale);
00848     av_freep(&ctx->mb_rc);
00849     av_freep(&ctx->mb_cmp);
00850     av_freep(&ctx->slice_size);
00851     av_freep(&ctx->slice_offs);
00852 
00853     av_freep(&ctx->qmatrix_c);
00854     av_freep(&ctx->qmatrix_l);
00855     av_freep(&ctx->qmatrix_c16);
00856     av_freep(&ctx->qmatrix_l16);
00857 
00858     for (i = 1; i < avctx->thread_count; i++)
00859         av_freep(&ctx->thread[i]);
00860 
00861     return 0;
00862 }
00863 
00864 AVCodec ff_dnxhd_encoder = {
00865     "dnxhd",
00866     AVMEDIA_TYPE_VIDEO,
00867     CODEC_ID_DNXHD,
00868     sizeof(DNXHDEncContext),
00869     dnxhd_encode_init,
00870     dnxhd_encode_picture,
00871     dnxhd_encode_end,
00872     .capabilities = CODEC_CAP_SLICE_THREADS,
00873     .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_NONE},
00874     .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
00875     .priv_class = &class,
00876 };