Libav
|
00001 /* 00002 * iWMMXt optimized DSP utils 00003 * copyright (c) 2004 AGAWA Koji 00004 * 00005 * This file is part of FFmpeg. 00006 * 00007 * FFmpeg is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * FFmpeg is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with FFmpeg; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00022 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00023 { 00024 int stride = line_size; 00025 __asm__ volatile ( 00026 "and r12, %[pixels], #7 \n\t" 00027 "bic %[pixels], %[pixels], #7 \n\t" 00028 "tmcr wcgr1, r12 \n\t" 00029 "add r4, %[pixels], %[line_size] \n\t" 00030 "add r5, %[block], %[line_size] \n\t" 00031 "mov %[line_size], %[line_size], lsl #1 \n\t" 00032 "1: \n\t" 00033 "wldrd wr0, [%[pixels]] \n\t" 00034 "subs %[h], %[h], #2 \n\t" 00035 "wldrd wr1, [%[pixels], #8] \n\t" 00036 "add %[pixels], %[pixels], %[line_size] \n\t" 00037 "wldrd wr3, [r4] \n\t" 00038 "pld [%[pixels]] \n\t" 00039 "pld [%[pixels], #32] \n\t" 00040 "wldrd wr4, [r4, #8] \n\t" 00041 "add r4, r4, %[line_size] \n\t" 00042 "walignr1 wr8, wr0, wr1 \n\t" 00043 "pld [r4] \n\t" 00044 "pld [r4, #32] \n\t" 00045 "walignr1 wr10, wr3, wr4 \n\t" 00046 "wstrd wr8, [%[block]] \n\t" 00047 "add %[block], %[block], %[line_size] \n\t" 00048 "wstrd wr10, [r5] \n\t" 00049 "add r5, r5, %[line_size] \n\t" 00050 "bne 1b \n\t" 00051 : 
[block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 00052 : 00053 : "memory", "r4", "r5", "r12"); 00054 } 00055 00056 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00057 { 00058 int stride = line_size; 00059 __asm__ volatile ( 00060 "and r12, %[pixels], #7 \n\t" 00061 "bic %[pixels], %[pixels], #7 \n\t" 00062 "tmcr wcgr1, r12 \n\t" 00063 "add r4, %[pixels], %[line_size] \n\t" 00064 "add r5, %[block], %[line_size] \n\t" 00065 "mov %[line_size], %[line_size], lsl #1 \n\t" 00066 "1: \n\t" 00067 "wldrd wr0, [%[pixels]] \n\t" 00068 "subs %[h], %[h], #2 \n\t" 00069 "wldrd wr1, [%[pixels], #8] \n\t" 00070 "add %[pixels], %[pixels], %[line_size] \n\t" 00071 "wldrd wr3, [r4] \n\t" 00072 "pld [%[pixels]] \n\t" 00073 "pld [%[pixels], #32] \n\t" 00074 "wldrd wr4, [r4, #8] \n\t" 00075 "add r4, r4, %[line_size] \n\t" 00076 "walignr1 wr8, wr0, wr1 \n\t" 00077 "wldrd wr0, [%[block]] \n\t" 00078 "wldrd wr2, [r5] \n\t" 00079 "pld [r4] \n\t" 00080 "pld [r4, #32] \n\t" 00081 "walignr1 wr10, wr3, wr4 \n\t" 00082 WAVG2B" wr8, wr8, wr0 \n\t" 00083 WAVG2B" wr10, wr10, wr2 \n\t" 00084 "wstrd wr8, [%[block]] \n\t" 00085 "add %[block], %[block], %[line_size] \n\t" 00086 "wstrd wr10, [r5] \n\t" 00087 "pld [%[block]] \n\t" 00088 "pld [%[block], #32] \n\t" 00089 "add r5, r5, %[line_size] \n\t" 00090 "pld [r5] \n\t" 00091 "pld [r5, #32] \n\t" 00092 "bne 1b \n\t" 00093 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 00094 : 00095 : "memory", "r4", "r5", "r12"); 00096 } 00097 00098 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00099 { 00100 int stride = line_size; 00101 __asm__ volatile ( 00102 "and r12, %[pixels], #7 \n\t" 00103 "bic %[pixels], %[pixels], #7 \n\t" 00104 "tmcr wcgr1, r12 \n\t" 00105 "add r4, %[pixels], %[line_size] \n\t" 00106 "add r5, %[block], %[line_size] \n\t" 00107 "mov %[line_size], %[line_size], lsl #1 \n\t" 00108 "1: \n\t" 00109 
"wldrd wr0, [%[pixels]] \n\t" 00110 "wldrd wr1, [%[pixels], #8] \n\t" 00111 "subs %[h], %[h], #2 \n\t" 00112 "wldrd wr2, [%[pixels], #16] \n\t" 00113 "add %[pixels], %[pixels], %[line_size] \n\t" 00114 "wldrd wr3, [r4] \n\t" 00115 "pld [%[pixels]] \n\t" 00116 "pld [%[pixels], #32] \n\t" 00117 "walignr1 wr8, wr0, wr1 \n\t" 00118 "wldrd wr4, [r4, #8] \n\t" 00119 "walignr1 wr9, wr1, wr2 \n\t" 00120 "wldrd wr5, [r4, #16] \n\t" 00121 "add r4, r4, %[line_size] \n\t" 00122 "pld [r4] \n\t" 00123 "pld [r4, #32] \n\t" 00124 "walignr1 wr10, wr3, wr4 \n\t" 00125 "wstrd wr8, [%[block]] \n\t" 00126 "walignr1 wr11, wr4, wr5 \n\t" 00127 "wstrd wr9, [%[block], #8] \n\t" 00128 "add %[block], %[block], %[line_size] \n\t" 00129 "wstrd wr10, [r5] \n\t" 00130 "wstrd wr11, [r5, #8] \n\t" 00131 "add r5, r5, %[line_size] \n\t" 00132 "bne 1b \n\t" 00133 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 00134 : 00135 : "memory", "r4", "r5", "r12"); 00136 } 00137 00138 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00139 { 00140 int stride = line_size; 00141 __asm__ volatile ( 00142 "pld [%[pixels]] \n\t" 00143 "pld [%[pixels], #32] \n\t" 00144 "pld [%[block]] \n\t" 00145 "pld [%[block], #32] \n\t" 00146 "and r12, %[pixels], #7 \n\t" 00147 "bic %[pixels], %[pixels], #7 \n\t" 00148 "tmcr wcgr1, r12 \n\t" 00149 "add r4, %[pixels], %[line_size]\n\t" 00150 "add r5, %[block], %[line_size] \n\t" 00151 "mov %[line_size], %[line_size], lsl #1 \n\t" 00152 "1: \n\t" 00153 "wldrd wr0, [%[pixels]] \n\t" 00154 "wldrd wr1, [%[pixels], #8] \n\t" 00155 "subs %[h], %[h], #2 \n\t" 00156 "wldrd wr2, [%[pixels], #16] \n\t" 00157 "add %[pixels], %[pixels], %[line_size] \n\t" 00158 "wldrd wr3, [r4] \n\t" 00159 "pld [%[pixels]] \n\t" 00160 "pld [%[pixels], #32] \n\t" 00161 "walignr1 wr8, wr0, wr1 \n\t" 00162 "wldrd wr4, [r4, #8] \n\t" 00163 "walignr1 wr9, wr1, wr2 \n\t" 00164 "wldrd wr5, [r4, #16] \n\t" 00165 "add r4, r4, %[line_size] \n\t" 
00166 "wldrd wr0, [%[block]] \n\t" 00167 "pld [r4] \n\t" 00168 "wldrd wr1, [%[block], #8] \n\t" 00169 "pld [r4, #32] \n\t" 00170 "wldrd wr2, [r5] \n\t" 00171 "walignr1 wr10, wr3, wr4 \n\t" 00172 "wldrd wr3, [r5, #8] \n\t" 00173 WAVG2B" wr8, wr8, wr0 \n\t" 00174 WAVG2B" wr9, wr9, wr1 \n\t" 00175 WAVG2B" wr10, wr10, wr2 \n\t" 00176 "wstrd wr8, [%[block]] \n\t" 00177 "walignr1 wr11, wr4, wr5 \n\t" 00178 WAVG2B" wr11, wr11, wr3 \n\t" 00179 "wstrd wr9, [%[block], #8] \n\t" 00180 "add %[block], %[block], %[line_size] \n\t" 00181 "wstrd wr10, [r5] \n\t" 00182 "pld [%[block]] \n\t" 00183 "pld [%[block], #32] \n\t" 00184 "wstrd wr11, [r5, #8] \n\t" 00185 "add r5, r5, %[line_size] \n\t" 00186 "pld [r5] \n\t" 00187 "pld [r5, #32] \n\t" 00188 "bne 1b \n\t" 00189 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 00190 : 00191 : "memory", "r4", "r5", "r12"); 00192 } 00193 00194 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00195 { 00196 int stride = line_size; 00197 // [wr0 wr1 wr2 wr3] for previous line 00198 // [wr4 wr5 wr6 wr7] for current line 00199 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 00200 __asm__ volatile( 00201 "pld [%[pixels]] \n\t" 00202 "pld [%[pixels], #32] \n\t" 00203 "and r12, %[pixels], #7 \n\t" 00204 "bic %[pixels], %[pixels], #7 \n\t" 00205 "tmcr wcgr1, r12 \n\t" 00206 "add r12, r12, #1 \n\t" 00207 "add r4, %[pixels], %[line_size]\n\t" 00208 "tmcr wcgr2, r12 \n\t" 00209 "add r5, %[block], %[line_size] \n\t" 00210 "mov %[line_size], %[line_size], lsl #1 \n\t" 00211 00212 "1: \n\t" 00213 "wldrd wr10, [%[pixels]] \n\t" 00214 "cmp r12, #8 \n\t" 00215 "wldrd wr11, [%[pixels], #8] \n\t" 00216 "add %[pixels], %[pixels], %[line_size] \n\t" 00217 "wldrd wr13, [r4] \n\t" 00218 "pld [%[pixels]] \n\t" 00219 "wldrd wr14, [r4, #8] \n\t" 00220 "pld [%[pixels], #32] \n\t" 00221 "add r4, r4, %[line_size] \n\t" 00222 "walignr1 wr0, wr10, wr11 \n\t" 00223 "pld [r4] \n\t" 00224 "pld [r4, 
#32] \n\t" 00225 "walignr1 wr2, wr13, wr14 \n\t" 00226 "wmoveq wr4, wr11 \n\t" 00227 "wmoveq wr6, wr14 \n\t" 00228 "walignr2ne wr4, wr10, wr11 \n\t" 00229 "walignr2ne wr6, wr13, wr14 \n\t" 00230 WAVG2B" wr0, wr0, wr4 \n\t" 00231 WAVG2B" wr2, wr2, wr6 \n\t" 00232 "wstrd wr0, [%[block]] \n\t" 00233 "subs %[h], %[h], #2 \n\t" 00234 "wstrd wr2, [r5] \n\t" 00235 "add %[block], %[block], %[line_size] \n\t" 00236 "add r5, r5, %[line_size] \n\t" 00237 "bne 1b \n\t" 00238 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00239 : 00240 : "r4", "r5", "r12", "memory"); 00241 } 00242 00243 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00244 { 00245 int stride = line_size; 00246 // [wr0 wr1 wr2 wr3] for previous line 00247 // [wr4 wr5 wr6 wr7] for current line 00248 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 00249 __asm__ volatile( 00250 "pld [%[pixels]] \n\t" 00251 "pld [%[pixels], #32] \n\t" 00252 "and r12, %[pixels], #7 \n\t" 00253 "bic %[pixels], %[pixels], #7 \n\t" 00254 "tmcr wcgr1, r12 \n\t" 00255 "add r12, r12, #1 \n\t" 00256 "add r4, %[pixels], %[line_size]\n\t" 00257 "tmcr wcgr2, r12 \n\t" 00258 "add r5, %[block], %[line_size] \n\t" 00259 "mov %[line_size], %[line_size], lsl #1 \n\t" 00260 00261 "1: \n\t" 00262 "wldrd wr10, [%[pixels]] \n\t" 00263 "cmp r12, #8 \n\t" 00264 "wldrd wr11, [%[pixels], #8] \n\t" 00265 "wldrd wr12, [%[pixels], #16] \n\t" 00266 "add %[pixels], %[pixels], %[line_size] \n\t" 00267 "wldrd wr13, [r4] \n\t" 00268 "pld [%[pixels]] \n\t" 00269 "wldrd wr14, [r4, #8] \n\t" 00270 "pld [%[pixels], #32] \n\t" 00271 "wldrd wr15, [r4, #16] \n\t" 00272 "add r4, r4, %[line_size] \n\t" 00273 "walignr1 wr0, wr10, wr11 \n\t" 00274 "pld [r4] \n\t" 00275 "pld [r4, #32] \n\t" 00276 "walignr1 wr1, wr11, wr12 \n\t" 00277 "walignr1 wr2, wr13, wr14 \n\t" 00278 "walignr1 wr3, wr14, wr15 \n\t" 00279 "wmoveq wr4, wr11 \n\t" 00280 "wmoveq wr5, wr12 \n\t" 00281 "wmoveq wr6, wr14 \n\t" 
00282 "wmoveq wr7, wr15 \n\t" 00283 "walignr2ne wr4, wr10, wr11 \n\t" 00284 "walignr2ne wr5, wr11, wr12 \n\t" 00285 "walignr2ne wr6, wr13, wr14 \n\t" 00286 "walignr2ne wr7, wr14, wr15 \n\t" 00287 WAVG2B" wr0, wr0, wr4 \n\t" 00288 WAVG2B" wr1, wr1, wr5 \n\t" 00289 "wstrd wr0, [%[block]] \n\t" 00290 WAVG2B" wr2, wr2, wr6 \n\t" 00291 "wstrd wr1, [%[block], #8] \n\t" 00292 WAVG2B" wr3, wr3, wr7 \n\t" 00293 "add %[block], %[block], %[line_size] \n\t" 00294 "wstrd wr2, [r5] \n\t" 00295 "subs %[h], %[h], #2 \n\t" 00296 "wstrd wr3, [r5, #8] \n\t" 00297 "add r5, r5, %[line_size] \n\t" 00298 "bne 1b \n\t" 00299 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00300 : 00301 : "r4", "r5", "r12", "memory"); 00302 } 00303 00304 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00305 { 00306 int stride = line_size; 00307 // [wr0 wr1 wr2 wr3] for previous line 00308 // [wr4 wr5 wr6 wr7] for current line 00309 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 00310 __asm__ volatile( 00311 "pld [%[pixels]] \n\t" 00312 "pld [%[pixels], #32] \n\t" 00313 "pld [%[block]] \n\t" 00314 "pld [%[block], #32] \n\t" 00315 "and r12, %[pixels], #7 \n\t" 00316 "bic %[pixels], %[pixels], #7 \n\t" 00317 "tmcr wcgr1, r12 \n\t" 00318 "add r12, r12, #1 \n\t" 00319 "add r4, %[pixels], %[line_size]\n\t" 00320 "tmcr wcgr2, r12 \n\t" 00321 "add r5, %[block], %[line_size] \n\t" 00322 "mov %[line_size], %[line_size], lsl #1 \n\t" 00323 "pld [r5] \n\t" 00324 "pld [r5, #32] \n\t" 00325 00326 "1: \n\t" 00327 "wldrd wr10, [%[pixels]] \n\t" 00328 "cmp r12, #8 \n\t" 00329 "wldrd wr11, [%[pixels], #8] \n\t" 00330 "add %[pixels], %[pixels], %[line_size] \n\t" 00331 "wldrd wr13, [r4] \n\t" 00332 "pld [%[pixels]] \n\t" 00333 "wldrd wr14, [r4, #8] \n\t" 00334 "pld [%[pixels], #32] \n\t" 00335 "add r4, r4, %[line_size] \n\t" 00336 "walignr1 wr0, wr10, wr11 \n\t" 00337 "pld [r4] \n\t" 00338 "pld [r4, #32] \n\t" 00339 "walignr1 wr2, wr13, wr14 
\n\t" 00340 "wmoveq wr4, wr11 \n\t" 00341 "wmoveq wr6, wr14 \n\t" 00342 "walignr2ne wr4, wr10, wr11 \n\t" 00343 "wldrd wr10, [%[block]] \n\t" 00344 "walignr2ne wr6, wr13, wr14 \n\t" 00345 "wldrd wr12, [r5] \n\t" 00346 WAVG2B" wr0, wr0, wr4 \n\t" 00347 WAVG2B" wr2, wr2, wr6 \n\t" 00348 WAVG2B" wr0, wr0, wr10 \n\t" 00349 WAVG2B" wr2, wr2, wr12 \n\t" 00350 "wstrd wr0, [%[block]] \n\t" 00351 "subs %[h], %[h], #2 \n\t" 00352 "wstrd wr2, [r5] \n\t" 00353 "add %[block], %[block], %[line_size] \n\t" 00354 "add r5, r5, %[line_size] \n\t" 00355 "pld [%[block]] \n\t" 00356 "pld [%[block], #32] \n\t" 00357 "pld [r5] \n\t" 00358 "pld [r5, #32] \n\t" 00359 "bne 1b \n\t" 00360 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00361 : 00362 : "r4", "r5", "r12", "memory"); 00363 } 00364 00365 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00366 { 00367 int stride = line_size; 00368 // [wr0 wr1 wr2 wr3] for previous line 00369 // [wr4 wr5 wr6 wr7] for current line 00370 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 00371 __asm__ volatile( 00372 "pld [%[pixels]] \n\t" 00373 "pld [%[pixels], #32] \n\t" 00374 "pld [%[block]] \n\t" 00375 "pld [%[block], #32] \n\t" 00376 "and r12, %[pixels], #7 \n\t" 00377 "bic %[pixels], %[pixels], #7 \n\t" 00378 "tmcr wcgr1, r12 \n\t" 00379 "add r12, r12, #1 \n\t" 00380 "add r4, %[pixels], %[line_size]\n\t" 00381 "tmcr wcgr2, r12 \n\t" 00382 "add r5, %[block], %[line_size] \n\t" 00383 "mov %[line_size], %[line_size], lsl #1 \n\t" 00384 "pld [r5] \n\t" 00385 "pld [r5, #32] \n\t" 00386 00387 "1: \n\t" 00388 "wldrd wr10, [%[pixels]] \n\t" 00389 "cmp r12, #8 \n\t" 00390 "wldrd wr11, [%[pixels], #8] \n\t" 00391 "wldrd wr12, [%[pixels], #16] \n\t" 00392 "add %[pixels], %[pixels], %[line_size] \n\t" 00393 "wldrd wr13, [r4] \n\t" 00394 "pld [%[pixels]] \n\t" 00395 "wldrd wr14, [r4, #8] \n\t" 00396 "pld [%[pixels], #32] \n\t" 00397 "wldrd wr15, [r4, #16] \n\t" 00398 "add r4, 
r4, %[line_size] \n\t" 00399 "walignr1 wr0, wr10, wr11 \n\t" 00400 "pld [r4] \n\t" 00401 "pld [r4, #32] \n\t" 00402 "walignr1 wr1, wr11, wr12 \n\t" 00403 "walignr1 wr2, wr13, wr14 \n\t" 00404 "walignr1 wr3, wr14, wr15 \n\t" 00405 "wmoveq wr4, wr11 \n\t" 00406 "wmoveq wr5, wr12 \n\t" 00407 "wmoveq wr6, wr14 \n\t" 00408 "wmoveq wr7, wr15 \n\t" 00409 "walignr2ne wr4, wr10, wr11 \n\t" 00410 "walignr2ne wr5, wr11, wr12 \n\t" 00411 "walignr2ne wr6, wr13, wr14 \n\t" 00412 "walignr2ne wr7, wr14, wr15 \n\t" 00413 "wldrd wr10, [%[block]] \n\t" 00414 WAVG2B" wr0, wr0, wr4 \n\t" 00415 "wldrd wr11, [%[block], #8] \n\t" 00416 WAVG2B" wr1, wr1, wr5 \n\t" 00417 "wldrd wr12, [r5] \n\t" 00418 WAVG2B" wr2, wr2, wr6 \n\t" 00419 "wldrd wr13, [r5, #8] \n\t" 00420 WAVG2B" wr3, wr3, wr7 \n\t" 00421 WAVG2B" wr0, wr0, wr10 \n\t" 00422 WAVG2B" wr1, wr1, wr11 \n\t" 00423 WAVG2B" wr2, wr2, wr12 \n\t" 00424 WAVG2B" wr3, wr3, wr13 \n\t" 00425 "wstrd wr0, [%[block]] \n\t" 00426 "subs %[h], %[h], #2 \n\t" 00427 "wstrd wr1, [%[block], #8] \n\t" 00428 "add %[block], %[block], %[line_size] \n\t" 00429 "wstrd wr2, [r5] \n\t" 00430 "pld [%[block]] \n\t" 00431 "wstrd wr3, [r5, #8] \n\t" 00432 "add r5, r5, %[line_size] \n\t" 00433 "pld [%[block], #32] \n\t" 00434 "pld [r5] \n\t" 00435 "pld [r5, #32] \n\t" 00436 "bne 1b \n\t" 00437 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00438 : 00439 :"r4", "r5", "r12", "memory"); 00440 } 00441 00442 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00443 { 00444 int stride = line_size; 00445 // [wr0 wr1 wr2 wr3] for previous line 00446 // [wr4 wr5 wr6 wr7] for current line 00447 __asm__ volatile( 00448 "pld [%[pixels]] \n\t" 00449 "pld [%[pixels], #32] \n\t" 00450 "and r12, %[pixels], #7 \n\t" 00451 "tmcr wcgr1, r12 \n\t" 00452 "bic %[pixels], %[pixels], #7 \n\t" 00453 00454 "wldrd wr10, [%[pixels]] \n\t" 00455 "wldrd wr11, [%[pixels], #8] \n\t" 00456 "pld [%[block]] \n\t" 00457 "add 
%[pixels], %[pixels], %[line_size] \n\t" 00458 "walignr1 wr0, wr10, wr11 \n\t" 00459 "pld [%[pixels]] \n\t" 00460 "pld [%[pixels], #32] \n\t" 00461 00462 "1: \n\t" 00463 "wldrd wr10, [%[pixels]] \n\t" 00464 "wldrd wr11, [%[pixels], #8] \n\t" 00465 "add %[pixels], %[pixels], %[line_size] \n\t" 00466 "pld [%[pixels]] \n\t" 00467 "pld [%[pixels], #32] \n\t" 00468 "walignr1 wr4, wr10, wr11 \n\t" 00469 "wldrd wr10, [%[block]] \n\t" 00470 WAVG2B" wr8, wr0, wr4 \n\t" 00471 WAVG2B" wr8, wr8, wr10 \n\t" 00472 "wstrd wr8, [%[block]] \n\t" 00473 "add %[block], %[block], %[line_size] \n\t" 00474 00475 "wldrd wr10, [%[pixels]] \n\t" 00476 "wldrd wr11, [%[pixels], #8] \n\t" 00477 "pld [%[block]] \n\t" 00478 "add %[pixels], %[pixels], %[line_size] \n\t" 00479 "pld [%[pixels]] \n\t" 00480 "pld [%[pixels], #32] \n\t" 00481 "walignr1 wr0, wr10, wr11 \n\t" 00482 "wldrd wr10, [%[block]] \n\t" 00483 WAVG2B" wr8, wr0, wr4 \n\t" 00484 WAVG2B" wr8, wr8, wr10 \n\t" 00485 "wstrd wr8, [%[block]] \n\t" 00486 "add %[block], %[block], %[line_size] \n\t" 00487 00488 "subs %[h], %[h], #2 \n\t" 00489 "pld [%[block]] \n\t" 00490 "bne 1b \n\t" 00491 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00492 : 00493 : "cc", "memory", "r12"); 00494 } 00495 00496 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00497 { 00498 int stride = line_size; 00499 // [wr0 wr1 wr2 wr3] for previous line 00500 // [wr4 wr5 wr6 wr7] for current line 00501 __asm__ volatile( 00502 "pld [%[pixels]] \n\t" 00503 "pld [%[pixels], #32] \n\t" 00504 "and r12, %[pixels], #7 \n\t" 00505 "tmcr wcgr1, r12 \n\t" 00506 "bic %[pixels], %[pixels], #7 \n\t" 00507 00508 "wldrd wr10, [%[pixels]] \n\t" 00509 "wldrd wr11, [%[pixels], #8] \n\t" 00510 "wldrd wr12, [%[pixels], #16] \n\t" 00511 "add %[pixels], %[pixels], %[line_size] \n\t" 00512 "pld [%[pixels]] \n\t" 00513 "pld [%[pixels], #32] \n\t" 00514 "walignr1 wr0, wr10, wr11 \n\t" 00515 "walignr1 wr1, wr11, wr12 
\n\t" 00516 00517 "1: \n\t" 00518 "wldrd wr10, [%[pixels]] \n\t" 00519 "wldrd wr11, [%[pixels], #8] \n\t" 00520 "wldrd wr12, [%[pixels], #16] \n\t" 00521 "add %[pixels], %[pixels], %[line_size] \n\t" 00522 "pld [%[pixels]] \n\t" 00523 "pld [%[pixels], #32] \n\t" 00524 "walignr1 wr4, wr10, wr11 \n\t" 00525 "walignr1 wr5, wr11, wr12 \n\t" 00526 WAVG2B" wr8, wr0, wr4 \n\t" 00527 WAVG2B" wr9, wr1, wr5 \n\t" 00528 "wstrd wr8, [%[block]] \n\t" 00529 "wstrd wr9, [%[block], #8] \n\t" 00530 "add %[block], %[block], %[line_size] \n\t" 00531 00532 "wldrd wr10, [%[pixels]] \n\t" 00533 "wldrd wr11, [%[pixels], #8] \n\t" 00534 "wldrd wr12, [%[pixels], #16] \n\t" 00535 "add %[pixels], %[pixels], %[line_size] \n\t" 00536 "pld [%[pixels]] \n\t" 00537 "pld [%[pixels], #32] \n\t" 00538 "walignr1 wr0, wr10, wr11 \n\t" 00539 "walignr1 wr1, wr11, wr12 \n\t" 00540 WAVG2B" wr8, wr0, wr4 \n\t" 00541 WAVG2B" wr9, wr1, wr5 \n\t" 00542 "wstrd wr8, [%[block]] \n\t" 00543 "wstrd wr9, [%[block], #8] \n\t" 00544 "add %[block], %[block], %[line_size] \n\t" 00545 00546 "subs %[h], %[h], #2 \n\t" 00547 "bne 1b \n\t" 00548 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00549 : 00550 : "r4", "r5", "r12", "memory"); 00551 } 00552 00553 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00554 { 00555 int stride = line_size; 00556 // [wr0 wr1 wr2 wr3] for previous line 00557 // [wr4 wr5 wr6 wr7] for current line 00558 __asm__ volatile( 00559 "pld [%[pixels]] \n\t" 00560 "pld [%[pixels], #32] \n\t" 00561 "and r12, %[pixels], #7 \n\t" 00562 "tmcr wcgr1, r12 \n\t" 00563 "bic %[pixels], %[pixels], #7 \n\t" 00564 00565 "wldrd wr10, [%[pixels]] \n\t" 00566 "wldrd wr11, [%[pixels], #8] \n\t" 00567 "pld [%[block]] \n\t" 00568 "wldrd wr12, [%[pixels], #16] \n\t" 00569 "add %[pixels], %[pixels], %[line_size] \n\t" 00570 "pld [%[pixels]] \n\t" 00571 "pld [%[pixels], #32] \n\t" 00572 "walignr1 wr0, wr10, wr11 \n\t" 00573 "walignr1 wr1, wr11, 
wr12 \n\t" 00574 00575 "1: \n\t" 00576 "wldrd wr10, [%[pixels]] \n\t" 00577 "wldrd wr11, [%[pixels], #8] \n\t" 00578 "wldrd wr12, [%[pixels], #16] \n\t" 00579 "add %[pixels], %[pixels], %[line_size] \n\t" 00580 "pld [%[pixels]] \n\t" 00581 "pld [%[pixels], #32] \n\t" 00582 "walignr1 wr4, wr10, wr11 \n\t" 00583 "walignr1 wr5, wr11, wr12 \n\t" 00584 "wldrd wr10, [%[block]] \n\t" 00585 "wldrd wr11, [%[block], #8] \n\t" 00586 WAVG2B" wr8, wr0, wr4 \n\t" 00587 WAVG2B" wr9, wr1, wr5 \n\t" 00588 WAVG2B" wr8, wr8, wr10 \n\t" 00589 WAVG2B" wr9, wr9, wr11 \n\t" 00590 "wstrd wr8, [%[block]] \n\t" 00591 "wstrd wr9, [%[block], #8] \n\t" 00592 "add %[block], %[block], %[line_size] \n\t" 00593 00594 "wldrd wr10, [%[pixels]] \n\t" 00595 "wldrd wr11, [%[pixels], #8] \n\t" 00596 "pld [%[block]] \n\t" 00597 "wldrd wr12, [%[pixels], #16] \n\t" 00598 "add %[pixels], %[pixels], %[line_size] \n\t" 00599 "pld [%[pixels]] \n\t" 00600 "pld [%[pixels], #32] \n\t" 00601 "walignr1 wr0, wr10, wr11 \n\t" 00602 "walignr1 wr1, wr11, wr12 \n\t" 00603 "wldrd wr10, [%[block]] \n\t" 00604 "wldrd wr11, [%[block], #8] \n\t" 00605 WAVG2B" wr8, wr0, wr4 \n\t" 00606 WAVG2B" wr9, wr1, wr5 \n\t" 00607 WAVG2B" wr8, wr8, wr10 \n\t" 00608 WAVG2B" wr9, wr9, wr11 \n\t" 00609 "wstrd wr8, [%[block]] \n\t" 00610 "wstrd wr9, [%[block], #8] \n\t" 00611 "add %[block], %[block], %[line_size] \n\t" 00612 00613 "subs %[h], %[h], #2 \n\t" 00614 "pld [%[block]] \n\t" 00615 "bne 1b \n\t" 00616 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 00617 : 00618 : "r4", "r5", "r12", "memory"); 00619 } 00620 00621 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00622 { 00623 // [wr0 wr1 wr2 wr3] for previous line 00624 // [wr4 wr5 wr6 wr7] for current line 00625 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 00626 __asm__ volatile( 00627 "pld [%[pixels]] \n\t" 00628 "mov r12, #2 \n\t" 00629 "pld [%[pixels], #32] \n\t" 00630 "tmcr wcgr0, r12 \n\t" /* 
for shift value */ 00631 "and r12, %[pixels], #7 \n\t" 00632 "bic %[pixels], %[pixels], #7 \n\t" 00633 "tmcr wcgr1, r12 \n\t" 00634 00635 // [wr0 wr1 wr2 wr3] <= * 00636 // [wr4 wr5 wr6 wr7] 00637 "wldrd wr12, [%[pixels]] \n\t" 00638 "add r12, r12, #1 \n\t" 00639 "wldrd wr13, [%[pixels], #8] \n\t" 00640 "tmcr wcgr2, r12 \n\t" 00641 "add %[pixels], %[pixels], %[line_size] \n\t" 00642 "cmp r12, #8 \n\t" 00643 "pld [%[pixels]] \n\t" 00644 "pld [%[pixels], #32] \n\t" 00645 "walignr1 wr2, wr12, wr13 \n\t" 00646 "wmoveq wr10, wr13 \n\t" 00647 "walignr2ne wr10, wr12, wr13 \n\t" 00648 "wunpckelub wr0, wr2 \n\t" 00649 "wunpckehub wr1, wr2 \n\t" 00650 "wunpckelub wr8, wr10 \n\t" 00651 "wunpckehub wr9, wr10 \n\t" 00652 "waddhus wr0, wr0, wr8 \n\t" 00653 "waddhus wr1, wr1, wr9 \n\t" 00654 00655 "1: \n\t" 00656 // [wr0 wr1 wr2 wr3] 00657 // [wr4 wr5 wr6 wr7] <= * 00658 "wldrd wr12, [%[pixels]] \n\t" 00659 "cmp r12, #8 \n\t" 00660 "wldrd wr13, [%[pixels], #8] \n\t" 00661 "add %[pixels], %[pixels], %[line_size] \n\t" 00662 "walignr1 wr6, wr12, wr13 \n\t" 00663 "pld [%[pixels]] \n\t" 00664 "pld [%[pixels], #32] \n\t" 00665 "wmoveq wr10, wr13 \n\t" 00666 "walignr2ne wr10, wr12, wr13 \n\t" 00667 "wunpckelub wr4, wr6 \n\t" 00668 "wunpckehub wr5, wr6 \n\t" 00669 "wunpckelub wr8, wr10 \n\t" 00670 "wunpckehub wr9, wr10 \n\t" 00671 "waddhus wr4, wr4, wr8 \n\t" 00672 "waddhus wr5, wr5, wr9 \n\t" 00673 "waddhus wr8, wr0, wr4 \n\t" 00674 "waddhus wr9, wr1, wr5 \n\t" 00675 "waddhus wr8, wr8, wr15 \n\t" 00676 "waddhus wr9, wr9, wr15 \n\t" 00677 "wsrlhg wr8, wr8, wcgr0 \n\t" 00678 "wsrlhg wr9, wr9, wcgr0 \n\t" 00679 "wpackhus wr8, wr8, wr9 \n\t" 00680 "wstrd wr8, [%[block]] \n\t" 00681 "add %[block], %[block], %[line_size] \n\t" 00682 00683 // [wr0 wr1 wr2 wr3] <= * 00684 // [wr4 wr5 wr6 wr7] 00685 "wldrd wr12, [%[pixels]] \n\t" 00686 "wldrd wr13, [%[pixels], #8] \n\t" 00687 "add %[pixels], %[pixels], %[line_size] \n\t" 00688 "walignr1 wr2, wr12, wr13 \n\t" 00689 "pld [%[pixels]] \n\t" 00690 
"pld [%[pixels], #32] \n\t" 00691 "wmoveq wr10, wr13 \n\t" 00692 "walignr2ne wr10, wr12, wr13 \n\t" 00693 "wunpckelub wr0, wr2 \n\t" 00694 "wunpckehub wr1, wr2 \n\t" 00695 "wunpckelub wr8, wr10 \n\t" 00696 "wunpckehub wr9, wr10 \n\t" 00697 "waddhus wr0, wr0, wr8 \n\t" 00698 "waddhus wr1, wr1, wr9 \n\t" 00699 "waddhus wr8, wr0, wr4 \n\t" 00700 "waddhus wr9, wr1, wr5 \n\t" 00701 "waddhus wr8, wr8, wr15 \n\t" 00702 "waddhus wr9, wr9, wr15 \n\t" 00703 "wsrlhg wr8, wr8, wcgr0 \n\t" 00704 "wsrlhg wr9, wr9, wcgr0 \n\t" 00705 "wpackhus wr8, wr8, wr9 \n\t" 00706 "subs %[h], %[h], #2 \n\t" 00707 "wstrd wr8, [%[block]] \n\t" 00708 "add %[block], %[block], %[line_size] \n\t" 00709 "bne 1b \n\t" 00710 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 00711 : [line_size]"r"(line_size) 00712 : "r12", "memory"); 00713 } 00714 00715 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00716 { 00717 // [wr0 wr1 wr2 wr3] for previous line 00718 // [wr4 wr5 wr6 wr7] for current line 00719 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 00720 __asm__ volatile( 00721 "pld [%[pixels]] \n\t" 00722 "mov r12, #2 \n\t" 00723 "pld [%[pixels], #32] \n\t" 00724 "tmcr wcgr0, r12 \n\t" /* for shift value */ 00725 /* alignment */ 00726 "and r12, %[pixels], #7 \n\t" 00727 "bic %[pixels], %[pixels], #7 \n\t" 00728 "tmcr wcgr1, r12 \n\t" 00729 "add r12, r12, #1 \n\t" 00730 "tmcr wcgr2, r12 \n\t" 00731 00732 // [wr0 wr1 wr2 wr3] <= * 00733 // [wr4 wr5 wr6 wr7] 00734 "wldrd wr12, [%[pixels]] \n\t" 00735 "cmp r12, #8 \n\t" 00736 "wldrd wr13, [%[pixels], #8] \n\t" 00737 "wldrd wr14, [%[pixels], #16] \n\t" 00738 "add %[pixels], %[pixels], %[line_size] \n\t" 00739 "pld [%[pixels]] \n\t" 00740 "walignr1 wr2, wr12, wr13 \n\t" 00741 "pld [%[pixels], #32] \n\t" 00742 "walignr1 wr3, wr13, wr14 \n\t" 00743 "wmoveq wr10, wr13 \n\t" 00744 "wmoveq wr11, wr14 \n\t" 00745 "walignr2ne wr10, wr12, wr13 \n\t" 00746 "walignr2ne wr11, wr13, wr14 \n\t" 00747 "wunpckelub 
wr0, wr2 \n\t" 00748 "wunpckehub wr1, wr2 \n\t" 00749 "wunpckelub wr2, wr3 \n\t" 00750 "wunpckehub wr3, wr3 \n\t" 00751 "wunpckelub wr8, wr10 \n\t" 00752 "wunpckehub wr9, wr10 \n\t" 00753 "wunpckelub wr10, wr11 \n\t" 00754 "wunpckehub wr11, wr11 \n\t" 00755 "waddhus wr0, wr0, wr8 \n\t" 00756 "waddhus wr1, wr1, wr9 \n\t" 00757 "waddhus wr2, wr2, wr10 \n\t" 00758 "waddhus wr3, wr3, wr11 \n\t" 00759 00760 "1: \n\t" 00761 // [wr0 wr1 wr2 wr3] 00762 // [wr4 wr5 wr6 wr7] <= * 00763 "wldrd wr12, [%[pixels]] \n\t" 00764 "cmp r12, #8 \n\t" 00765 "wldrd wr13, [%[pixels], #8] \n\t" 00766 "wldrd wr14, [%[pixels], #16] \n\t" 00767 "add %[pixels], %[pixels], %[line_size] \n\t" 00768 "walignr1 wr6, wr12, wr13 \n\t" 00769 "pld [%[pixels]] \n\t" 00770 "pld [%[pixels], #32] \n\t" 00771 "walignr1 wr7, wr13, wr14 \n\t" 00772 "wmoveq wr10, wr13 \n\t" 00773 "wmoveq wr11, wr14 \n\t" 00774 "walignr2ne wr10, wr12, wr13 \n\t" 00775 "walignr2ne wr11, wr13, wr14 \n\t" 00776 "wunpckelub wr4, wr6 \n\t" 00777 "wunpckehub wr5, wr6 \n\t" 00778 "wunpckelub wr6, wr7 \n\t" 00779 "wunpckehub wr7, wr7 \n\t" 00780 "wunpckelub wr8, wr10 \n\t" 00781 "wunpckehub wr9, wr10 \n\t" 00782 "wunpckelub wr10, wr11 \n\t" 00783 "wunpckehub wr11, wr11 \n\t" 00784 "waddhus wr4, wr4, wr8 \n\t" 00785 "waddhus wr5, wr5, wr9 \n\t" 00786 "waddhus wr6, wr6, wr10 \n\t" 00787 "waddhus wr7, wr7, wr11 \n\t" 00788 "waddhus wr8, wr0, wr4 \n\t" 00789 "waddhus wr9, wr1, wr5 \n\t" 00790 "waddhus wr10, wr2, wr6 \n\t" 00791 "waddhus wr11, wr3, wr7 \n\t" 00792 "waddhus wr8, wr8, wr15 \n\t" 00793 "waddhus wr9, wr9, wr15 \n\t" 00794 "waddhus wr10, wr10, wr15 \n\t" 00795 "waddhus wr11, wr11, wr15 \n\t" 00796 "wsrlhg wr8, wr8, wcgr0 \n\t" 00797 "wsrlhg wr9, wr9, wcgr0 \n\t" 00798 "wsrlhg wr10, wr10, wcgr0 \n\t" 00799 "wsrlhg wr11, wr11, wcgr0 \n\t" 00800 "wpackhus wr8, wr8, wr9 \n\t" 00801 "wpackhus wr9, wr10, wr11 \n\t" 00802 "wstrd wr8, [%[block]] \n\t" 00803 "wstrd wr9, [%[block], #8] \n\t" 00804 "add %[block], %[block], %[line_size] 
\n\t" 00805 00806 // [wr0 wr1 wr2 wr3] <= * 00807 // [wr4 wr5 wr6 wr7] 00808 "wldrd wr12, [%[pixels]] \n\t" 00809 "wldrd wr13, [%[pixels], #8] \n\t" 00810 "wldrd wr14, [%[pixels], #16] \n\t" 00811 "add %[pixels], %[pixels], %[line_size] \n\t" 00812 "walignr1 wr2, wr12, wr13 \n\t" 00813 "pld [%[pixels]] \n\t" 00814 "pld [%[pixels], #32] \n\t" 00815 "walignr1 wr3, wr13, wr14 \n\t" 00816 "wmoveq wr10, wr13 \n\t" 00817 "wmoveq wr11, wr14 \n\t" 00818 "walignr2ne wr10, wr12, wr13 \n\t" 00819 "walignr2ne wr11, wr13, wr14 \n\t" 00820 "wunpckelub wr0, wr2 \n\t" 00821 "wunpckehub wr1, wr2 \n\t" 00822 "wunpckelub wr2, wr3 \n\t" 00823 "wunpckehub wr3, wr3 \n\t" 00824 "wunpckelub wr8, wr10 \n\t" 00825 "wunpckehub wr9, wr10 \n\t" 00826 "wunpckelub wr10, wr11 \n\t" 00827 "wunpckehub wr11, wr11 \n\t" 00828 "waddhus wr0, wr0, wr8 \n\t" 00829 "waddhus wr1, wr1, wr9 \n\t" 00830 "waddhus wr2, wr2, wr10 \n\t" 00831 "waddhus wr3, wr3, wr11 \n\t" 00832 "waddhus wr8, wr0, wr4 \n\t" 00833 "waddhus wr9, wr1, wr5 \n\t" 00834 "waddhus wr10, wr2, wr6 \n\t" 00835 "waddhus wr11, wr3, wr7 \n\t" 00836 "waddhus wr8, wr8, wr15 \n\t" 00837 "waddhus wr9, wr9, wr15 \n\t" 00838 "waddhus wr10, wr10, wr15 \n\t" 00839 "waddhus wr11, wr11, wr15 \n\t" 00840 "wsrlhg wr8, wr8, wcgr0 \n\t" 00841 "wsrlhg wr9, wr9, wcgr0 \n\t" 00842 "wsrlhg wr10, wr10, wcgr0 \n\t" 00843 "wsrlhg wr11, wr11, wcgr0 \n\t" 00844 "wpackhus wr8, wr8, wr9 \n\t" 00845 "wpackhus wr9, wr10, wr11 \n\t" 00846 "wstrd wr8, [%[block]] \n\t" 00847 "wstrd wr9, [%[block], #8] \n\t" 00848 "add %[block], %[block], %[line_size] \n\t" 00849 00850 "subs %[h], %[h], #2 \n\t" 00851 "bne 1b \n\t" 00852 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 00853 : [line_size]"r"(line_size) 00854 : "r12", "memory"); 00855 } 00856 00857 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 00858 { 00859 // [wr0 wr1 wr2 wr3] for previous line 00860 // [wr4 wr5 wr6 wr7] for current line 00861 SET_RND(wr15); // =2 for rnd 
and =1 for no_rnd version 00862 __asm__ volatile( 00863 "pld [%[block]] \n\t" 00864 "pld [%[block], #32] \n\t" 00865 "pld [%[pixels]] \n\t" 00866 "mov r12, #2 \n\t" 00867 "pld [%[pixels], #32] \n\t" 00868 "tmcr wcgr0, r12 \n\t" /* for shift value */ 00869 "and r12, %[pixels], #7 \n\t" 00870 "bic %[pixels], %[pixels], #7 \n\t" 00871 "tmcr wcgr1, r12 \n\t" 00872 00873 // [wr0 wr1 wr2 wr3] <= * 00874 // [wr4 wr5 wr6 wr7] 00875 "wldrd wr12, [%[pixels]] \n\t" 00876 "add r12, r12, #1 \n\t" 00877 "wldrd wr13, [%[pixels], #8] \n\t" 00878 "tmcr wcgr2, r12 \n\t" 00879 "add %[pixels], %[pixels], %[line_size] \n\t" 00880 "cmp r12, #8 \n\t" 00881 "pld [%[pixels]] \n\t" 00882 "pld [%[pixels], #32] \n\t" 00883 "walignr1 wr2, wr12, wr13 \n\t" 00884 "wmoveq wr10, wr13 \n\t" 00885 "walignr2ne wr10, wr12, wr13 \n\t" 00886 "wunpckelub wr0, wr2 \n\t" 00887 "wunpckehub wr1, wr2 \n\t" 00888 "wunpckelub wr8, wr10 \n\t" 00889 "wunpckehub wr9, wr10 \n\t" 00890 "waddhus wr0, wr0, wr8 \n\t" 00891 "waddhus wr1, wr1, wr9 \n\t" 00892 00893 "1: \n\t" 00894 // [wr0 wr1 wr2 wr3] 00895 // [wr4 wr5 wr6 wr7] <= * 00896 "wldrd wr12, [%[pixels]] \n\t" 00897 "cmp r12, #8 \n\t" 00898 "wldrd wr13, [%[pixels], #8] \n\t" 00899 "add %[pixels], %[pixels], %[line_size] \n\t" 00900 "walignr1 wr6, wr12, wr13 \n\t" 00901 "pld [%[pixels]] \n\t" 00902 "pld [%[pixels], #32] \n\t" 00903 "wmoveq wr10, wr13 \n\t" 00904 "walignr2ne wr10, wr12, wr13 \n\t" 00905 "wunpckelub wr4, wr6 \n\t" 00906 "wunpckehub wr5, wr6 \n\t" 00907 "wunpckelub wr8, wr10 \n\t" 00908 "wunpckehub wr9, wr10 \n\t" 00909 "waddhus wr4, wr4, wr8 \n\t" 00910 "waddhus wr5, wr5, wr9 \n\t" 00911 "waddhus wr8, wr0, wr4 \n\t" 00912 "waddhus wr9, wr1, wr5 \n\t" 00913 "waddhus wr8, wr8, wr15 \n\t" 00914 "waddhus wr9, wr9, wr15 \n\t" 00915 "wldrd wr12, [%[block]] \n\t" 00916 "wsrlhg wr8, wr8, wcgr0 \n\t" 00917 "wsrlhg wr9, wr9, wcgr0 \n\t" 00918 "wpackhus wr8, wr8, wr9 \n\t" 00919 WAVG2B" wr8, wr8, wr12 \n\t" 00920 "wstrd wr8, [%[block]] \n\t" 00921 "add 
%[block], %[block], %[line_size] \n\t" 00922 "wldrd wr12, [%[pixels]] \n\t" 00923 "pld [%[block]] \n\t" 00924 "pld [%[block], #32] \n\t" 00925 00926 // [wr0 wr1 wr2 wr3] <= * 00927 // [wr4 wr5 wr6 wr7] 00928 "wldrd wr13, [%[pixels], #8] \n\t" 00929 "add %[pixels], %[pixels], %[line_size] \n\t" 00930 "walignr1 wr2, wr12, wr13 \n\t" 00931 "pld [%[pixels]] \n\t" 00932 "pld [%[pixels], #32] \n\t" 00933 "wmoveq wr10, wr13 \n\t" 00934 "walignr2ne wr10, wr12, wr13 \n\t" 00935 "wunpckelub wr0, wr2 \n\t" 00936 "wunpckehub wr1, wr2 \n\t" 00937 "wunpckelub wr8, wr10 \n\t" 00938 "wunpckehub wr9, wr10 \n\t" 00939 "waddhus wr0, wr0, wr8 \n\t" 00940 "waddhus wr1, wr1, wr9 \n\t" 00941 "waddhus wr8, wr0, wr4 \n\t" 00942 "waddhus wr9, wr1, wr5 \n\t" 00943 "waddhus wr8, wr8, wr15 \n\t" 00944 "waddhus wr9, wr9, wr15 \n\t" 00945 "wldrd wr12, [%[block]] \n\t" 00946 "wsrlhg wr8, wr8, wcgr0 \n\t" 00947 "wsrlhg wr9, wr9, wcgr0 \n\t" 00948 "wpackhus wr8, wr8, wr9 \n\t" 00949 "subs %[h], %[h], #2 \n\t" 00950 WAVG2B" wr8, wr8, wr12 \n\t" 00951 "wstrd wr8, [%[block]] \n\t" 00952 "add %[block], %[block], %[line_size] \n\t" 00953 "pld [%[block]] \n\t" 00954 "pld [%[block], #32] \n\t" 00955 "bne 1b \n\t" 00956 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 00957 : [line_size]"r"(line_size) 00958 : "r12", "memory"); 00959 } 00960

/**
 * Blend a 16-pixel-wide, half-pel (x and y) interpolated source into block.
 *
 * For every output row the code sums four source bytes per pixel -- the
 * byte at the requested position and the byte one to its right, taken
 * from two consecutive source rows -- adds the constant held in wr15,
 * shifts the 16-bit sums right by 2, packs them back to bytes and merges
 * the result with the 16 bytes already stored in block via WAVG2B.
 *
 * @param block      destination; 16 bytes are read and rewritten per row
 *                   with wldrd/wstrd (NOTE(review): presumably must be
 *                   8-byte aligned -- confirm against callers)
 * @param pixels     source; may be unaligned, the low three address bits
 *                   are split off and handled with walignr1/walignr2.
 *                   24 bytes starting at the aligned-down address are read
 *                   from each of h + 1 consecutive rows.
 * @param line_size  distance in bytes between rows, used for both block
 *                   and pixels
 * @param h          number of output rows; two rows are produced per loop
 *                   iteration and the loop only terminates when h reaches
 *                   exactly 0, so h must be a positive even number
 */
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    // NOTE(review): wr0-wr15 and wcgr0-wcgr2 are written below without being
    // declared as clobbers, and wr15 is expected to survive from SET_RND into
    // this asm statement; this matches the other functions in this file.
    __asm__ volatile(
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* alignment */
        /* wcgr1 = pixels & 7 (offset of x), wcgr2 = that + 1 (offset of x+1) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "tmcr wcgr2, r12 \n\t"

        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]] \n\t"
        /* r12 == 8 means the x+1 data is exactly the next doubleword, so it
         * is copied with wmoveq instead of extracted with walignr2ne
         * (presumably because an offset of 8 cannot be used with walignr2) */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        /* widen bytes to halfwords and form the horizontal sums x + (x+1) */
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"

        "1: \n\t"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]] \n\t"
        /* flags were overwritten by the subs at the loop tail: redo the test */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr7, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr6, wr7 \n\t"
        "wunpckehub wr7, wr7 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        "waddhus wr6, wr6, wr10 \n\t"
        "waddhus wr7, wr7, wr11 \n\t"
        /* previous-row sums + current-row sums + wr15, then >> 2 */
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wldrd wr12, [%[block]] \n\t"
        "wldrd wr13, [%[block], #8] \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        /* merge with the bytes already present in block */
        WAVG2B" wr8, wr8, wr12 \n\t"
        WAVG2B" wr9, wr9, wr13 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"

        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        /* no cmp here: nothing since the cmp above has touched the flags,
         * so the wmoveq/walignr2ne pair below reuses that result */
        "wldrd wr12, [%[pixels]] \n\t"
        "pld [%[block]] \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "pld [%[block], #32] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wldrd wr12, [%[block]] \n\t"
        "wldrd wr13, [%[block], #8] \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        WAVG2B" wr9, wr9, wr13 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        /* two rows done; the flags set here drive the bne below */
        "subs %[h], %[h], #2 \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs modify the condition flags */
        : "r12", "cc", "memory");
}