ref: a0320dac5d0eea33be9d7dd3b226598e4dadcf48
dir: /codec/encoder/core/arm64/reconstruct_aarch64_neon.S/
/*! * \copy * Copyright (c) 2013, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */

// AArch64 NEON kernels for quantization, dequantization, forward/inverse
// 4x4 integer transforms and Hadamard transforms.
//
// Syntax  : GNU as / clang, preprocessed by the C preprocessor (.S).
// ABI     : AAPCS64. Only x0-x4 and v0-v7 / v16-v23 are written, all of
//           which are caller-saved, so no register is spilled.
// Macros  : two textual variants are kept. The Apple assembler uses
//           positional dollar-N macro arguments, the GNU assembler uses
//           named arguments. Both variants must stay instruction-for-
//           instruction identical.

#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"

#ifdef __APPLE__

//---------------------------------------------------------------------------
// Apple assembler macro set (positional arguments)
//---------------------------------------------------------------------------

// Count the zero coefficients held in two 8 x int16 vectors.
//   arg 0, arg 1 : coefficient vectors (clobbered)
//   arg 2        : scalar byte register receiving the number of zero lanes
.macro ZERO_COUNT_IN_2_QUARWORD
    cmeq    $0.8h, $0.8h, #0                // 0xFFFF in every lane that is zero
    cmeq    $1.8h, $1.8h, #0
    uzp1    $0.16b, $0.16b, $1.16b          // keep the low byte of each 16-bit mask
    ushr    $0.16b, $0.16b, #7              // 0xFF -> 1, 0x00 -> 0
    addv    $2, $0.16b                      // horizontal sum = number of zeros
.endm

// Quantize eight int16 coefficients:
//   q = ((ff + |coef|) * mf) >> 16, result = (coef > 0) ? q : -q
//   arg 0 : coef (read only)
//   arg 1 : ff on entry, quantized result on exit
//   arg 2 : mf
//   arg 3 : working vector (zeroed here), arg 4 / arg 5 : working vectors
.macro NEWQUANT_COEF_EACH_16BITS
    eor     $3.16b, $3.16b, $3.16b          // zero, used as the saba reference
    saba    $1.8h, $0.8h, $3.8h             // ff + abs(coef - 0)
    smull   $4.4s, $1.4h, $2.4h
    smull2  $5.4s, $1.8h, $2.8h
    shrn    $1.4h, $4.4s, #16               // keep the high half of each product
    shrn2   $1.8h, $5.4s, #16
    cmgt    $4.8h, $0.8h, #0                // all-ones where coef > 0
    bif     $3.16b, $1.16b, $4.16b          // q where coef <= 0, else still 0
    shl     $3.8h, $3.8h, #1                // 2q (or 0)
    sub     $1.8h, $1.8h, $3.8h             // q - 2q = -q for coef <= 0, q otherwise
.endm

// Same as NEWQUANT_COEF_EACH_16BITS, and additionally
//   arg 6 : receives the unsigned magnitude q (before the sign is applied)
.macro NEWQUANT_COEF_EACH_16BITS_MAX
    eor     $3.16b, $3.16b, $3.16b          // zero, used as the saba reference
    saba    $1.8h, $0.8h, $3.8h             // ff + abs(coef - 0)
    smull   $4.4s, $1.4h, $2.4h
    smull2  $5.4s, $1.8h, $2.8h
    shrn    $1.4h, $4.4s, #16               // keep the high half of each product
    shrn2   $1.8h, $5.4s, #16
    cmgt    $4.8h, $0.8h, #0                // all-ones where coef > 0
    bif     $3.16b, $1.16b, $4.16b          // q where coef <= 0, else still 0
    shl     $3.8h, $3.8h, #1                // 2q (or 0)
    mov.16b $6, $1                          // save the magnitude for the max search
    sub     $1.8h, $1.8h, $3.8h             // q - 2q = -q for coef <= 0, q otherwise
.endm

// Quantize four int16 coefficients (low 64 bits only).
//   arg 0 : coef, arg 1 : ff in / result out, arg 2 : mf
//   arg 3 : working vector that MUST be zero on entry (clobbered)
//   arg 4 : working vector
.macro QUANT_DUALWORD_COEF_EACH_16BITS
    saba    $1.8h, $0.8h, $3.8h             // ff + abs(coef - 0)
    smull   $4.4s, $1.4h, $2.4h
    shrn    $1.4h, $4.4s, #16
    cmgt    $4.8h, $0.8h, #0                // all-ones where coef > 0
    bif     $3.16b, $1.16b, $4.16b          // q where coef <= 0, else still 0
    shl     $3.8h, $3.8h, #1
    sub     $1.8h, $1.8h, $3.8h             // apply the sign
.endm

// Unsigned maximum over two pairs of 8 x uint16 vectors.
//   arg 0..arg 3 : magnitude vectors (arg 0 and arg 2 are clobbered)
//   arg 4        : scalar halfword register = max over arg 0 and arg 1
//   arg 5        : scalar halfword register = max over arg 2 and arg 3
.macro SELECT_MAX_IN_ABS_COEF
    umax    $0.8h, $0.8h, $1.8h
    umaxv   $4, $0.8h
    umax    $2.8h, $2.8h, $3.8h
    umaxv   $5, $2.8h
.endm

// One butterfly stage of the 2x2 Hadamard transform on four halfwords.
//   arg 0 : source [a b c d] in the low 64 bits
//   arg 1 : destination [a+c a-c b+d b-d]
//   arg 2 : working vector
.macro HDM_QUANT_2x2_TOTAL_16BITS
    sshr    $1.2d, $0.2d, #32               // bring [c d] down to lanes 0 and 1
    add     $2.4h, $0.4h, $1.4h             // [0] = a + c, [1] = b + d
    sub     $1.4h, $0.4h, $1.4h             // [0] = a - c, [1] = b - d
    zip1    $1.4h, $2.4h, $1.4h             // interleave sums and differences
.endm

// Count the zero lanes among four int16 values.
//   arg 0 : coefficients (clobbered)
//   arg 1 : scalar halfword register receiving the count
//   arg 2 : vector holding 1 in every halfword lane
.macro DC_ZERO_COUNT_IN_DUALWORD
    cmeq    $0.4h, $0.4h, #0
    and     $0.8b, $0.8b, $2.8b             // 0xFFFF -> 1
    addv    $1, $0.4h
.endm

// Two butterfly passes over each group of four halfwords held in arg 0.
//   arg 0 : source and destination
//   arg 1, arg 2 : working vectors
.macro IHDM_4x4_TOTAL_16BITS
    uzp2    $1.4s, $0.4s, $0.4s
    uzp1    $0.4s, $0.4s, $0.4s
    add     $2.8h, $0.8h, $1.8h             // [0]=rs[0]+rs[2] [1]=rs[1]+rs[3] [2]=rs[4]+rs[6] [3]=rs[5]+rs[7]
    sub     $1.8h, $0.8h, $1.8h             // [0]=rs[0]-rs[2] [1]=rs[1]-rs[3] [2]=rs[4]-rs[6] [3]=rs[5]-rs[7]
    zip1    $2.8h, $2.8h, $1.8h             // [0]=rs[0]+rs[2] [1]=rs[0]-rs[2] ...

    uzp2    $1.4s, $2.4s, $2.4s
    uzp1    $0.4s, $2.4s, $2.4s
    add     $2.8h, $0.8h, $1.8h             // second pass, same lane pattern as above
    sub     $1.8h, $0.8h, $1.8h
    rev32   $1.4h, $1.4h                    // swap the halfwords inside each word
    zip1    $0.4s, $2.4s, $1.4s
.endm

// Transpose a 4x4 halfword matrix stored row-major in two q registers.
//   in  : arg 0 = [0 1 2 3]+[4 5 6 7], arg 1 = [8 9 10 11]+[12 13 14 15]
//   out : arg 0 = [0 4 8 12]+[1 5 9 13], arg 1 = [2 6 10 14]+[3 7 11 15]
//   arg 2, arg 3 : working vectors
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
    uzp1    $2.4s, $0.4s, $1.4s             // [0 1 4 5]+[8 9 12 13]
    uzp2    $3.4s, $0.4s, $1.4s             // [2 3 6 7]+[10 11 14 15]
    uzp1    $0.8h, $2.8h, $3.8h             // [0 4 8 12]+[2 6 10 14]
    uzp2    $2.8h, $2.8h, $3.8h             // [1 5 9 13]+[3 7 11 15]
    zip2    $1.2d, $0.2d, $2.2d             // [2 6 10 14]+[3 7 11 15]
    zip1    $0.2d, $0.2d, $2.2d             // [0 4 8 12]+[1 5 9 13]
.endm

// Transpose every 4x4 halfword tile spread over four q registers.
//   arg 0..arg 3 : rows in, columns out
//   arg 4..arg 7 : working vectors
// The sources are taken from the macro arguments (they used to be hard-coded
// to v0-v7, which only worked because every caller passes exactly v0..v7).
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
    trn1    $4.8h, $0.8h, $1.8h
    trn2    $5.8h, $0.8h, $1.8h
    trn1    $6.8h, $2.8h, $3.8h
    trn2    $7.8h, $2.8h, $3.8h

    trn1    $0.4s, $4.4s, $6.4s
    trn2    $2.4s, $4.4s, $6.4s
    trn1    $1.4s, $5.4s, $7.4s
    trn2    $3.4s, $5.4s, $7.4s
.endm

// Transpose a 4x4 halfword matrix held in the low halves of four registers.
//   in  : arg 0..arg 3 = [0 1 2 3], [4 5 6 7], [8 9 10 11], [12 13 14 15]
//   out : arg 0 = [0 4 8 12]+[1 5 9 13], arg 1 = [2 6 10 14]+[3 7 11 15]
//   arg 2 and arg 3 are clobbered
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
    mov     $0.d[1], $1.d[0]                // [0 1 2 3]+[4 5 6 7]
    mov     $2.d[1], $3.d[0]                // [8 9 10 11]+[12 13 14 15]
    uzp1    $1.4s, $0.4s, $2.4s             // [0 1 4 5]+[8 9 12 13]
    uzp2    $3.4s, $0.4s, $2.4s             // [2 3 6 7]+[10 11 14 15]
    uzp1    $0.8h, $1.8h, $3.8h             // [0 4 8 12]+[2 6 10 14]
    uzp2    $2.8h, $1.8h, $3.8h             // [1 5 9 13]+[3 7 11 15]
    zip2    $1.2d, $0.2d, $2.2d             // [2 6 10 14]+[3 7 11 15]
    zip1    $0.2d, $0.2d, $2.2d             // [0 4 8 12]+[1 5 9 13]
.endm

// Load two 4x4 byte blocks, one 32-bit row per lane.
//   arg 0 : destination for block 1, arg 2 : pointer 1, arg 3 : stride 1
//   arg 1 : destination for block 2, arg 4 : pointer 2, arg 5 : stride 2
// Both pointers are advanced by three strides.
.macro LOAD_4x4_DATA_FOR_DCT
    ld1     {$0.s}[0], [$2], $3
    ld1     {$0.s}[1], [$2], $3
    ld1     {$0.s}[2], [$2], $3
    ld1     {$0.s}[3], [$2]

    ld1     {$1.s}[0], [$4], $5
    ld1     {$1.s}[1], [$4], $5
    ld1     {$1.s}[2], [$4], $5
    ld1     {$1.s}[3], [$4]
.endm

// One 1-D pass of the forward 4-point integer transform on int16 lanes.
//   arg 0..arg 3 : data[i], data[i1], data[i2], data[i3] in, dct out
//   arg 4..arg 7 : working vectors s[0], s[1], s[2], s[3]
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
    add     $4.8h, $0.8h, $3.8h             // s[0] = data[i]  + data[i3]
    sub     $7.8h, $0.8h, $3.8h             // s[3] = data[i]  - data[i3]
    add     $5.8h, $1.8h, $2.8h             // s[1] = data[i1] + data[i2]
    sub     $6.8h, $1.8h, $2.8h             // s[2] = data[i1] - data[i2]

    add     $0.8h, $4.8h, $5.8h             // dct[i ] = s[0] + s[1]
    sub     $2.8h, $4.8h, $5.8h             // dct[i2] = s[0] - s[1]
    shl     $1.8h, $7.8h, #1
    shl     $3.8h, $6.8h, #1
    add     $1.8h, $1.8h, $6.8h             // dct[i1] = (s[3] << 1) + s[2]
    sub     $3.8h, $7.8h, $3.8h             // dct[i3] = s[3] - (s[2] << 1)
.endm

// Load four 8-byte rows from each of two sources.
//   arg 0..arg 3 : rows of source 1, arg 8 : pointer 1 (advanced)
//   arg 4..arg 7 : rows of source 2, arg 9 : pointer 2 (advanced)
// The strides are NOT macro arguments: x2 is used for source 1 and x4 for
// source 2, and both are left untouched.
.macro LOAD_8x4_DATA_FOR_DCT
    ld1     {$0.d}[0], [$8], x2
    ld1     {$1.d}[0], [$8], x2
    ld1     {$2.d}[0], [$8], x2
    ld1     {$3.d}[0], [$8], x2

    ld1     {$4.d}[0], [$9], x4
    ld1     {$5.d}[0], [$9], x4
    ld1     {$6.d}[0], [$9], x4
    ld1     {$7.d}[0], [$9], x4
.endm

// First half of a 1-D inverse transform pass on int16 lanes.
//   arg 0..arg 3 : src[0..3] (read only)
//   arg 4..arg 7 : e[0..3] out
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
    add     $4.8h, $0.8h, $2.8h             // e[0] = src[0] + src[2]
    sub     $5.8h, $0.8h, $2.8h             // e[1] = src[0] - src[2]
    sshr    $6.8h, $1.8h, #1
    sshr    $7.8h, $3.8h, #1
    sub     $6.8h, $6.8h, $3.8h             // e[2] = (src[1] >> 1) - src[3]
    add     $7.8h, $1.8h, $7.8h             // e[3] = src[1] + (src[3] >> 1)
.endm

// Second half of a 1-D pass on int16 lanes (used for rows and columns).
//   arg 0..arg 3 : f[0..3] out
//   arg 4..arg 7 : e[0..3] in
.macro TRANSFORM_TOTAL_16BITS
    add     $0.8h, $4.8h, $7.8h             // f[0] = e[0] + e[3]
    add     $1.8h, $5.8h, $6.8h             // f[1] = e[1] + e[2]
    sub     $2.8h, $5.8h, $6.8h             // f[2] = e[1] - e[2]
    sub     $3.8h, $4.8h, $7.8h             // f[3] = e[0] - e[3]
.endm

// First half of a Hadamard row pass, widening int16 to int32.
//   arg 0..arg 3 : src[0..3] (low 4 halfwords), arg 4..arg 7 : e[0..3] out
.macro ROW_TRANSFORM_0_STEP
    saddl   $4.4s, $0.4h, $2.4h             // e[0] = src[0] + src[2]
    ssubl   $5.4s, $0.4h, $2.4h             // e[1] = src[0] - src[2]
    ssubl   $6.4s, $1.4h, $3.4h             // e[2] = src[1] - src[3]
    saddl   $7.4s, $1.4h, $3.4h             // e[3] = src[1] + src[3]
.endm

// First half of a Hadamard column pass on int32 lanes (no shifts involved).
//   arg 0..arg 3 : f[0..3] in, arg 4..arg 7 : e[0..3] out
.macro COL_TRANSFORM_0_STEP
    add     $4.4s, $0.4s, $2.4s             // e[0][j] = f[0][j] + f[2][j]
    sub     $5.4s, $0.4s, $2.4s             // e[1][j] = f[0][j] - f[2][j]
    sub     $6.4s, $1.4s, $3.4s             // e[2][j] = f[1][j] - f[3][j]
    add     $7.4s, $1.4s, $3.4s             // e[3][j] = f[1][j] + f[3][j]
.endm

// Second half of a Hadamard pass on int32 lanes (rows and columns).
//   arg 0..arg 3 : f[0..3] out, arg 4..arg 7 : e[0..3] in
.macro TRANSFORM_4BYTES
    add     $0.4s, $4.4s, $7.4s             // int32 f[0] = e[0] + e[3]
    add     $1.4s, $5.4s, $6.4s             // int32 f[1] = e[1] + e[2]
    sub     $2.4s, $5.4s, $6.4s             // int32 f[2] = e[1] - e[2]
    sub     $3.4s, $4.4s, $7.4s             // int32 f[3] = e[0] - e[3]
.endm

// Add int16 residuals to 16 prediction bytes and saturate to [0, 255].
//   arg 0 : prediction bytes in, reconstructed bytes out
//   arg 1 : residual for bytes 0..7, arg 2 : residual for bytes 8..15
//   arg 3, arg 4 : working vectors
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
    uxtl    $3.8h, $0.8b
    uxtl2   $4.8h, $0.16b
    add     $3.8h, $3.8h, $1.8h
    add     $4.8h, $4.8h, $2.8h
    sqxtun  $0.8b, $3.8h                    // signed -> unsigned saturating narrow
    sqxtun2 $0.16b, $4.8h
.endm

#else

//---------------------------------------------------------------------------
// GNU assembler macro set (named arguments)
//---------------------------------------------------------------------------

// Count the zero coefficients held in two 8 x int16 vectors.
//   arg0, arg1 : coefficient vectors (clobbered)
//   arg2       : scalar byte register receiving the number of zero lanes
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
    cmeq    \arg0\().8h, \arg0\().8h, #0                    // 0xFFFF in every lane that is zero
    cmeq    \arg1\().8h, \arg1\().8h, #0
    uzp1    \arg0\().16b, \arg0\().16b, \arg1\().16b        // keep the low byte of each 16-bit mask
    ushr    \arg0\().16b, \arg0\().16b, #7                  // 0xFF -> 1, 0x00 -> 0
    addv    \arg2\(), \arg0\().16b                          // horizontal sum = number of zeros
.endm

// Quantize eight int16 coefficients:
//   q = ((ff + |coef|) * mf) >> 16, result = (coef > 0) ? q : -q
//   arg0 : coef (read only)
//   arg1 : ff on entry, quantized result on exit
//   arg2 : mf
//   arg3 : working vector (zeroed here), arg4 / arg5 : working vectors
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5
    eor     \arg3\().16b, \arg3\().16b, \arg3\().16b        // zero, used as the saba reference
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h           // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h
    shrn    \arg1\().4h, \arg4\().4s, #16                   // keep the high half of each product
    shrn2   \arg1\().8h, \arg5\().4s, #16
    cmgt    \arg4\().8h, \arg0\().8h, #0                    // all-ones where coef > 0
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b        // q where coef <= 0, else still 0
    shl     \arg3\().8h, \arg3\().8h, #1                    // 2q (or 0)
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h           // q - 2q = -q for coef <= 0, q otherwise
.endm

// Same as NEWQUANT_COEF_EACH_16BITS, and additionally
//   arg6 : receives the unsigned magnitude q (before the sign is applied)
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6
    eor     \arg3\().16b, \arg3\().16b, \arg3\().16b        // zero, used as the saba reference
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h           // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h
    shrn    \arg1\().4h, \arg4\().4s, #16                   // keep the high half of each product
    shrn2   \arg1\().8h, \arg5\().4s, #16
    cmgt    \arg4\().8h, \arg0\().8h, #0                    // all-ones where coef > 0
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b        // q where coef <= 0, else still 0
    shl     \arg3\().8h, \arg3\().8h, #1                    // 2q (or 0)
    mov     \arg6\().16b, \arg1\().16b                      // save the magnitude for the max search
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h           // q - 2q = -q for coef <= 0, q otherwise
.endm

// Quantize four int16 coefficients (low 64 bits only).
//   arg0 : coef, arg1 : ff in / result out, arg2 : mf
//   arg3 : working vector that MUST be zero on entry (clobbered)
//   arg4 : working vector
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h           // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    shrn    \arg1\().4h, \arg4\().4s, #16
    cmgt    \arg4\().8h, \arg0\().8h, #0                    // all-ones where coef > 0
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b        // q where coef <= 0, else still 0
    shl     \arg3\().8h, \arg3\().8h, #1
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h           // apply the sign
.endm

// Unsigned maximum over two pairs of 8 x uint16 vectors.
//   arg0..arg3 : magnitude vectors (arg0 and arg2 are clobbered)
//   arg4       : scalar halfword register = max over arg0 and arg1
//   arg5       : scalar halfword register = max over arg2 and arg3
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5
    umax    \arg0\().8h, \arg0\().8h, \arg1\().8h
    umaxv   \arg4\(), \arg0\().8h
    umax    \arg2\().8h, \arg2\().8h, \arg3\().8h
    umaxv   \arg5\(), \arg2\().8h
.endm

// One butterfly stage of the 2x2 Hadamard transform on four halfwords.
//   arg0 : source [a b c d] in the low 64 bits
//   arg1 : destination [a+c a-c b+d b-d]
//   arg2 : working vector
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
    sshr    \arg1\().2d, \arg0\().2d, #32                   // bring [c d] down to lanes 0 and 1
    add     \arg2\().4h, \arg0\().4h, \arg1\().4h           // [0] = a + c, [1] = b + d
    sub     \arg1\().4h, \arg0\().4h, \arg1\().4h           // [0] = a - c, [1] = b - d
    zip1    \arg1\().4h, \arg2\().4h, \arg1\().4h           // interleave sums and differences
.endm

// Count the zero lanes among four int16 values.
//   arg0 : coefficients (clobbered)
//   arg1 : scalar halfword register receiving the count
//   arg2 : vector holding 1 in every halfword lane
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
    cmeq    \arg0\().4h, \arg0\().4h, #0
    and     \arg0\().8b, \arg0\().8b, \arg2\().8b           // 0xFFFF -> 1
    addv    \arg1\(), \arg0\().4h
.endm

// Two butterfly passes over each group of four halfwords held in arg0.
//   arg0 : source and destination
//   arg1, arg2 : working vectors
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
    uzp2    \arg1\().4s, \arg0\().4s, \arg0\().4s
    uzp1    \arg0\().4s, \arg0\().4s, \arg0\().4s
    add     \arg2\().8h, \arg0\().8h, \arg1\().8h           // [0]=rs[0]+rs[2] [1]=rs[1]+rs[3] [2]=rs[4]+rs[6] [3]=rs[5]+rs[7]
    sub     \arg1\().8h, \arg0\().8h, \arg1\().8h           // [0]=rs[0]-rs[2] [1]=rs[1]-rs[3] [2]=rs[4]-rs[6] [3]=rs[5]-rs[7]
    zip1    \arg2\().8h, \arg2\().8h, \arg1\().8h           // [0]=rs[0]+rs[2] [1]=rs[0]-rs[2] ...

    uzp2    \arg1\().4s, \arg2\().4s, \arg2\().4s
    uzp1    \arg0\().4s, \arg2\().4s, \arg2\().4s
    add     \arg2\().8h, \arg0\().8h, \arg1\().8h           // second pass, same lane pattern as above
    sub     \arg1\().8h, \arg0\().8h, \arg1\().8h
    rev32   \arg1\().4h, \arg1\().4h                        // swap the halfwords inside each word
    zip1    \arg0\().4s, \arg2\().4s, \arg1\().4s
.endm

// Transpose a 4x4 halfword matrix stored row-major in two q registers.
//   in  : arg0 = [0 1 2 3]+[4 5 6 7], arg1 = [8 9 10 11]+[12 13 14 15]
//   out : arg0 = [0 4 8 12]+[1 5 9 13], arg1 = [2 6 10 14]+[3 7 11 15]
//   arg2, arg3 : working vectors
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3
    uzp1    \arg2\().4s, \arg0\().4s, \arg1\().4s           // [0 1 4 5]+[8 9 12 13]
    uzp2    \arg3\().4s, \arg0\().4s, \arg1\().4s           // [2 3 6 7]+[10 11 14 15]
    uzp1    \arg0\().8h, \arg2\().8h, \arg3\().8h           // [0 4 8 12]+[2 6 10 14]
    uzp2    \arg2\().8h, \arg2\().8h, \arg3\().8h           // [1 5 9 13]+[3 7 11 15]
    zip2    \arg1\().2d, \arg0\().2d, \arg2\().2d           // [2 6 10 14]+[3 7 11 15]
    zip1    \arg0\().2d, \arg0\().2d, \arg2\().2d           // [0 4 8 12]+[1 5 9 13]
.endm

// Transpose every 4x4 halfword tile spread over four q registers.
//   arg0..arg3 : rows in, columns out
//   arg4..arg7 : working vectors
// The sources are taken from the macro arguments (they used to be hard-coded
// to v0-v7, which only worked because every caller passes exactly v0..v7).
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    trn1    \arg4\().8h, \arg0\().8h, \arg1\().8h
    trn2    \arg5\().8h, \arg0\().8h, \arg1\().8h
    trn1    \arg6\().8h, \arg2\().8h, \arg3\().8h
    trn2    \arg7\().8h, \arg2\().8h, \arg3\().8h

    trn1    \arg0\().4s, \arg4\().4s, \arg6\().4s
    trn2    \arg2\().4s, \arg4\().4s, \arg6\().4s
    trn1    \arg1\().4s, \arg5\().4s, \arg7\().4s
    trn2    \arg3\().4s, \arg5\().4s, \arg7\().4s
.endm

// Transpose a 4x4 halfword matrix held in the low halves of four registers.
//   in  : arg0..arg3 = [0 1 2 3], [4 5 6 7], [8 9 10 11], [12 13 14 15]
//   out : arg0 = [0 4 8 12]+[1 5 9 13], arg1 = [2 6 10 14]+[3 7 11 15]
//   arg2 and arg3 are clobbered
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3
    mov     \arg0\().d[1], \arg1\().d[0]                    // [0 1 2 3]+[4 5 6 7]
    mov     \arg2\().d[1], \arg3\().d[0]                    // [8 9 10 11]+[12 13 14 15]
    uzp1    \arg1\().4s, \arg0\().4s, \arg2\().4s           // [0 1 4 5]+[8 9 12 13]
    uzp2    \arg3\().4s, \arg0\().4s, \arg2\().4s           // [2 3 6 7]+[10 11 14 15]
    uzp1    \arg0\().8h, \arg1\().8h, \arg3\().8h           // [0 4 8 12]+[2 6 10 14]
    uzp2    \arg2\().8h, \arg1\().8h, \arg3\().8h           // [1 5 9 13]+[3 7 11 15]
    zip2    \arg1\().2d, \arg0\().2d, \arg2\().2d           // [2 6 10 14]+[3 7 11 15]
    zip1    \arg0\().2d, \arg0\().2d, \arg2\().2d           // [0 4 8 12]+[1 5 9 13]
.endm

// Load two 4x4 byte blocks, one 32-bit row per lane.
//   arg0 : destination for block 1, arg2 : pointer 1, arg3 : stride 1
//   arg1 : destination for block 2, arg4 : pointer 2, arg5 : stride 2
// Both pointers are advanced by three strides.
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5
    ld1     {\arg0\().s}[0], [\arg2\()], \arg3\()
    ld1     {\arg0\().s}[1], [\arg2\()], \arg3\()
    ld1     {\arg0\().s}[2], [\arg2\()], \arg3\()
    ld1     {\arg0\().s}[3], [\arg2\()]

    ld1     {\arg1\().s}[0], [\arg4\()], \arg5\()
    ld1     {\arg1\().s}[1], [\arg4\()], \arg5\()
    ld1     {\arg1\().s}[2], [\arg4\()], \arg5\()
    ld1     {\arg1\().s}[3], [\arg4\()]
.endm

// One 1-D pass of the forward 4-point integer transform on int16 lanes.
//   arg0..arg3 : data[i], data[i1], data[i2], data[i3] in, dct out
//   arg4..arg7 : working vectors s[0], s[1], s[2], s[3]
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    add     \arg4\().8h, \arg0\().8h, \arg3\().8h           // s[0] = data[i]  + data[i3]
    sub     \arg7\().8h, \arg0\().8h, \arg3\().8h           // s[3] = data[i]  - data[i3]
    add     \arg5\().8h, \arg1\().8h, \arg2\().8h           // s[1] = data[i1] + data[i2]
    sub     \arg6\().8h, \arg1\().8h, \arg2\().8h           // s[2] = data[i1] - data[i2]

    add     \arg0\().8h, \arg4\().8h, \arg5\().8h           // dct[i ] = s[0] + s[1]
    sub     \arg2\().8h, \arg4\().8h, \arg5\().8h           // dct[i2] = s[0] - s[1]
    shl     \arg1\().8h, \arg7\().8h, #1
    shl     \arg3\().8h, \arg6\().8h, #1
    add     \arg1\().8h, \arg1\().8h, \arg6\().8h           // dct[i1] = (s[3] << 1) + s[2]
    sub     \arg3\().8h, \arg7\().8h, \arg3\().8h           // dct[i3] = s[3] - (s[2] << 1)
.endm

// Load four 8-byte rows from each of two sources.
//   arg0..arg3 : rows of source 1, arg8 : pointer 1 (advanced)
//   arg4..arg7 : rows of source 2, arg9 : pointer 2 (advanced)
// The strides are NOT macro arguments: x2 is used for source 1 and x4 for
// source 2, and both are left untouched.
.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    ld1     {\arg0\().d}[0], [\arg8\()], x2
    ld1     {\arg1\().d}[0], [\arg8\()], x2
    ld1     {\arg2\().d}[0], [\arg8\()], x2
    ld1     {\arg3\().d}[0], [\arg8\()], x2

    ld1     {\arg4\().d}[0], [\arg9\()], x4
    ld1     {\arg5\().d}[0], [\arg9\()], x4
    ld1     {\arg6\().d}[0], [\arg9\()], x4
    ld1     {\arg7\().d}[0], [\arg9\()], x4
.endm

// First half of a 1-D inverse transform pass on int16 lanes.
//   arg0..arg3 : src[0..3] (read only)
//   arg4..arg7 : e[0..3] out
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    add     \arg4\().8h, \arg0\().8h, \arg2\().8h           // e[0] = src[0] + src[2]
    sub     \arg5\().8h, \arg0\().8h, \arg2\().8h           // e[1] = src[0] - src[2]
    sshr    \arg6\().8h, \arg1\().8h, #1
    sshr    \arg7\().8h, \arg3\().8h, #1
    sub     \arg6\().8h, \arg6\().8h, \arg3\().8h           // e[2] = (src[1] >> 1) - src[3]
    add     \arg7\().8h, \arg1\().8h, \arg7\().8h           // e[3] = src[1] + (src[3] >> 1)
.endm

// Second half of a 1-D pass on int16 lanes (used for rows and columns).
//   arg0..arg3 : f[0..3] out
//   arg4..arg7 : e[0..3] in
.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    add     \arg0\().8h, \arg4\().8h, \arg7\().8h           // f[0] = e[0] + e[3]
    add     \arg1\().8h, \arg5\().8h, \arg6\().8h           // f[1] = e[1] + e[2]
    sub     \arg2\().8h, \arg5\().8h, \arg6\().8h           // f[2] = e[1] - e[2]
    sub     \arg3\().8h, \arg4\().8h, \arg7\().8h           // f[3] = e[0] - e[3]
.endm

// First half of a Hadamard row pass, widening int16 to int32.
//   arg0..arg3 : src[0..3] (low 4 halfwords), arg4..arg7 : e[0..3] out
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    saddl   \arg4\().4s, \arg0\().4h, \arg2\().4h           // e[0] = src[0] + src[2]
    ssubl   \arg5\().4s, \arg0\().4h, \arg2\().4h           // e[1] = src[0] - src[2]
    ssubl   \arg6\().4s, \arg1\().4h, \arg3\().4h           // e[2] = src[1] - src[3]
    saddl   \arg7\().4s, \arg1\().4h, \arg3\().4h           // e[3] = src[1] + src[3]
.endm

// First half of a Hadamard column pass on int32 lanes (no shifts involved).
//   arg0..arg3 : f[0..3] in, arg4..arg7 : e[0..3] out
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    add     \arg4\().4s, \arg0\().4s, \arg2\().4s           // e[0][j] = f[0][j] + f[2][j]
    sub     \arg5\().4s, \arg0\().4s, \arg2\().4s           // e[1][j] = f[0][j] - f[2][j]
    sub     \arg6\().4s, \arg1\().4s, \arg3\().4s           // e[2][j] = f[1][j] - f[3][j]
    add     \arg7\().4s, \arg1\().4s, \arg3\().4s           // e[3][j] = f[1][j] + f[3][j]
.endm

// Second half of a Hadamard pass on int32 lanes (rows and columns).
//   arg0..arg3 : f[0..3] out, arg4..arg7 : e[0..3] in
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    add     \arg0\().4s, \arg4\().4s, \arg7\().4s           // int32 f[0] = e[0] + e[3]
    add     \arg1\().4s, \arg5\().4s, \arg6\().4s           // int32 f[1] = e[1] + e[2]
    sub     \arg2\().4s, \arg5\().4s, \arg6\().4s           // int32 f[2] = e[1] - e[2]
    sub     \arg3\().4s, \arg4\().4s, \arg7\().4s           // int32 f[3] = e[0] - e[3]
.endm

// Add int16 residuals to 16 prediction bytes and saturate to [0, 255].
//   arg0 : prediction bytes in, reconstructed bytes out
//   arg1 : residual for bytes 0..7, arg2 : residual for bytes 8..15
//   arg3, arg4 : working vectors
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4
    uxtl    \arg3\().8h, \arg0\().8b
    uxtl2   \arg4\().8h, \arg0\().16b
    add     \arg3\().8h, \arg3\().8h, \arg1\().8h
    add     \arg4\().8h, \arg4\().8h, \arg2\().8h
    sqxtun  \arg0\().8b, \arg3\().8h                        // signed -> unsigned saturating narrow
    sqxtun2 \arg0\().16b, \arg4\().8h
.endm

#endif

//---------------------------------------------------------------------------
// WelsGetNoneZeroCount_AArch64_neon
// In  : x0 = pointer to 16 int16 coefficients
// Out : x0 = 16 - (number of coefficients equal to zero)
// Clob: x1, v0, v1, flags
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    ZERO_COUNT_IN_2_QUARWORD    v0, v1, b0  // b0 aliases v0, upper bits are cleared
    mov     x0, v0.d[0]                     // x0 = zero count
    mov     x1, #16
    subs    x0, x1, x0
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsQuant4x4_AArch64_neon
// In  : x0 = 16 int16 coefficients (quantized in place)
//       x1 = 8 int16 ff values, x2 = 8 int16 mf values
//       (the same 8 values are applied to both halves of the block)
// Clob: x0, v0-v7
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v0.8h, v1.8h}, [x0]
    ld1     {v3.8h}, [x2]
    mov     v4.16b, v2.16b                  // second copy of ff, the macro overwrites it

    NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7
    st1     {v2.8h}, [x0], #16

    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsQuant4x4Dc_AArch64_neon
// In  : x0 = 16 int16 coefficients (quantized in place)
//       w1 = scalar ff, w2 = scalar mf (low 16 bits are broadcast)
// Clob: x0, v0-v7
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4Dc_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    dup     v2.8h, w1                       // even ff range [0, 768]
    dup     v3.8h, w2
    mov     v4.16b, v2.16b                  // second copy of ff, the macro overwrites it

    NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7
    st1     {v2.8h}, [x0], #16

    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsQuantFour4x4_AArch64_neon
// In  : x0 = 64 int16 coefficients, four blocks of 16 (quantized in place)
//       x1 = 8 int16 ff values, x2 = 8 int16 mf values
// Clob: x0, x1, v0-v7
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v3.8h}, [x2]
    mov     x1, x0                          // x0 = read cursor, x1 = write cursor

.rept 4
    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS   v0, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x1], #16
.endr
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsQuantFour4x4Max_AArch64_neon
// In  : x0 = 64 int16 coefficients, four blocks of 16 (quantized in place)
//       x1 = 8 int16 ff values, x2 = 8 int16 mf values
//       x3 = destination for four int16 values: the largest quantized
//            magnitude of each block of 16
// Clob: x0, x1, v0-v7, v16-v23
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4Max_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v3.8h}, [x2]
    mov     x1, x0                          // x0 = read cursor, x1 = write cursor

    // block 0: magnitudes end up in v16 and v17
    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17
    st1     {v4.8h}, [x1], #16

    // block 1: magnitudes end up in v18 and v19
    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19
    st1     {v4.8h}, [x1], #16

    SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h20, h21

    // block 2: magnitudes end up in v16 and v17
    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17
    st1     {v4.8h}, [x1], #16

    // block 3: magnitudes end up in v18 and v19
    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19
    st1     {v4.8h}, [x1], #16

    SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h22, h23

    st4     {v20.h, v21.h, v22.h, v23.h}[0], [x3]   // four maxima, stored contiguously
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsDequant4x4_AArch64_neon
// In  : x0 = 16 int16 coefficients (scaled in place)
//       x1 = 8 int16 multipliers, applied to both halves of the block
// Clob: v0-v4
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequant4x4_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    ld1     {v2.8h}, [x1]
    mul     v3.8h, v0.8h, v2.8h
    mul     v4.8h, v1.8h, v2.8h
    st1     {v3.8h, v4.8h}, [x0]
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsDequantFour4x4_AArch64_neon
// In  : x0 = 64 int16 coefficients (scaled in place)
//       x1 = 8 int16 multipliers
// Clob: x0, x1, v0-v4
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantFour4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    mov     x1, x0                          // x0 = read cursor, x1 = write cursor
.rept 4
    ld1     {v0.8h, v1.8h}, [x0], #32
    mul     v3.8h, v0.8h, v2.8h
    mul     v4.8h, v1.8h, v2.8h
    st1     {v3.8h, v4.8h}, [x1], #32
.endr
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsHadamardQuant2x2SkipKernel_AArch64_neon
// In  : x0 = source, one int16 is read at byte offsets 0, 32, 64 and 96
//       w1 = threshold (low 16 bits are broadcast)
// Out : w0 = non-zero if any |transformed value| is above the threshold
//            (unsigned compare), zero otherwise
// Clob: x1, v0-v2, v4
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_AArch64_neon
    dup     v4.8h, w1
    mov     x1, #32                         // byte distance between the samples
    ld1     {v0.h}[0], [x0], x1             // rs[0]
    ld1     {v0.h}[1], [x0], x1             // rs[16]
    ld1     {v0.h}[2], [x0], x1             // rs[32]
    ld1     {v0.h}[3], [x0], x1             // rs[48]

    HDM_QUANT_2x2_TOTAL_16BITS  v0, v1, v2  // output in v1
    HDM_QUANT_2x2_TOTAL_16BITS  v1, v0, v2  // output in v0

    abs     v1.4h, v0.4h
    cmhi    v0.4h, v1.4h, v4.4h             // abs(dct[i]) > threshold
    mov     w0, v0.s[0]
    mov     w1, v0.s[1]
    orr     w0, w0, w1                      // fold the four lane masks together
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsHadamardQuant2x2_AArch64_neon
// In  : x0 = source, one int16 is read at byte offsets 0, 32, 64 and 96 and
//            each of those locations is then overwritten with zero
//       w1 = ff, w2 = mf (low 16 bits are broadcast)
//       x3, x4 = two destinations, each receives the four quantized values
// Out : x0 = 4 - (number of quantized values equal to zero)
// Clob: x1, x2, v0-v5, flags
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2_AArch64_neon
    dup     v1.8h, w1                       // ff
    dup     v2.8h, w2                       // mf
    eor     v3.16b, v3.16b, v3.16b          // zero: stored back, then used by the quant macro

    mov     x1, #32                         // byte distance between the samples
    mov     x2, x0                          // x0 = read cursor, x2 = write cursor
    ld1     {v0.h}[0], [x0], x1             // rs[0]
    st1     {v3.h}[0], [x2], x1             // rs[0]  = 0
    ld1     {v0.h}[1], [x0], x1             // rs[16]
    st1     {v3.h}[1], [x2], x1             // rs[16] = 0
    ld1     {v0.h}[2], [x0], x1             // rs[32]
    st1     {v3.h}[2], [x2], x1             // rs[32] = 0
    ld1     {v0.h}[3], [x0], x1             // rs[48]
    st1     {v3.h}[3], [x2], x1             // rs[48] = 0

    HDM_QUANT_2x2_TOTAL_16BITS  v0, v4, v5  // output in v4
    HDM_QUANT_2x2_TOTAL_16BITS  v4, v0, v5  // output in v0

    QUANT_DUALWORD_COEF_EACH_16BITS v0, v1, v2, v3, v4

    st1     {v1.d}[0], [x3]                 // store to dct
    st1     {v1.d}[0], [x4]                 // store to block

    movi    v3.8h, #1, lsl #0               // per-lane 1 used as the counting mask
    movi    v0.16b, #255
    DC_ZERO_COUNT_IN_DUALWORD   v1, h0, v3  // h0 aliases v0, upper bits are cleared

    mov     x0, v0.d[0]                     // x0 = zero count
    mov     x1, #4
    subs    x0, x1, x0
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsDequantIHadamard4x4_AArch64_neon
// In  : x0 = 16 int16 values (transformed and scaled in place)
//       w1 = scalar multiplier (low 16 bits are broadcast)
// Clob: v0-v4
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantIHadamard4x4_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    dup     v4.8h, w1

    // first direction
    IHDM_4x4_TOTAL_16BITS   v0, v2, v3
    IHDM_4x4_TOTAL_16BITS   v1, v2, v3

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2   v0, v1, v2, v3

    // second direction, followed by the scaling
    IHDM_4x4_TOTAL_16BITS   v0, v2, v3
    mul     v0.8h, v0.8h, v4.8h

    IHDM_4x4_TOTAL_16BITS   v1, v2, v3
    mul     v1.8h, v1.8h, v4.8h

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2   v0, v1, v2, v3
    st1     {v0.16b, v1.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsDctT4_AArch64_neon
// In  : x0 = destination, 16 int16 coefficients
//       x1 = first 4x4 byte block,  w2 = its row stride
//       x3 = second 4x4 byte block, w4 = its row stride
// The transform is applied to (block 1 - block 2).
// NOTE(review): the strides are assumed to be 32-bit signed ints in the C
// prototype (not visible in this file) - confirm. AAPCS64 leaves bits 63:32
// of such arguments unspecified, so they are sign-extended before being used
// as 64-bit post-index registers.
// Clob: x1-x4, v0-v7
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctT4_AArch64_neon
    sxtw    x2, w2
    sxtw    x4, w4
    LOAD_4x4_DATA_FOR_DCT   v0, v1, x1, x2, x3, x4
    usubl   v2.8h, v0.8b, v1.8b
    usubl2  v4.8h, v0.16b, v1.16b
    uzp1    v3.8h, v2.8h, v4.8h
    uzp2    v5.8h, v2.8h, v4.8h
    uzp2    v2.8h, v3.8h, v5.8h             // s[2, 6, 10, 14] [3, 7, 11, 15]
    uzp1    v0.8h, v3.8h, v5.8h             // s[0, 4, 8, 12] [1, 5, 9, 13]
    mov     v3.d[0], v2.d[1]                // s[3, 7, 11, 15]
    mov     v1.d[0], v0.d[1]                // s[1, 5, 9, 13]

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7
    // transpose
    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7

    st4     {v0.d, v1.d, v2.d, v3.d}[0], [x0]
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsDctFourT4_AArch64_neon
// In  : x0 = destination, 64 int16 coefficients
//       x1 = first 8x8 byte block,  w2 = its row stride
//       x3 = second 8x8 byte block, w4 = its row stride
// Each iteration handles four rows of eight pixels (two 4x4 blocks).
// NOTE(review): strides assumed to be 32-bit signed ints in the C prototype
// (not visible in this file) - confirm. They are sign-extended because the
// load macro uses x2 and x4 as 64-bit post-index registers.
// Clob: x0-x4, v0-v7
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
    sxtw    x2, w2
    sxtw    x4, w4
.rept 2
    LOAD_8x4_DATA_FOR_DCT   v0, v1, v2, v3, v4, v5, v6, v7, x1, x3
    usubl   v0.8h, v0.8b, v4.8b
    usubl   v1.8h, v1.8b, v5.8b
    usubl   v2.8h, v2.8b, v6.8b
    usubl   v3.8h, v3.8b, v7.8b

    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7
    // transpose
    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7

    uzp1    v4.2d, v0.2d, v1.2d
    uzp2    v6.2d, v0.2d, v1.2d
    uzp1    v5.2d, v2.2d, v3.2d
    uzp2    v7.2d, v2.2d, v3.2d
    st1     {v4.16b, v5.16b}, [x0], #32
    st1     {v6.16b, v7.16b}, [x0], #32
.endr
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsIDctT4Rec_AArch64_neon
// In  : x0 = reconstruction destination (4x4 bytes), w1 = its row stride
//       x2 = prediction (4x4 bytes),                 w3 = its row stride
//       x4 = 16 int16 coefficients
// Output pixel = saturate_u8(prediction + ((idct + 32) >> 6)).
// NOTE(review): strides assumed to be 32-bit signed ints in the C prototype
// (not visible in this file) - confirm. They are sign-extended before being
// used as 64-bit post-index registers.
// Clob: x0-x3, v0-v7, v16
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v16.s}[0], [x2], x3
    ld1     {v16.s}[1], [x2], x3
    ld1     {v16.s}[2], [x2], x3
    ld1     {v16.s}[3], [x2], x3            // prediction
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x4]  // dct coefficients, de-interleaved

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7
    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7

    ins     v0.d[1], v1.d[0]                // rows 0 and 1
    ins     v2.d[1], v3.d[0]                // rows 2 and 3
    srshr   v0.8h, v0.8h, #6                // rounding shift by 6
    srshr   v2.8h, v2.8h, #6

    // add the prediction and clip into [0, 255]
    uxtl    v1.8h, v16.8b
    add     v0.8h, v0.8h, v1.8h
    sqxtun  v1.8b, v0.8h
    st1     {v1.s}[0], [x0], x1
    st1     {v1.s}[1], [x0], x1

    uxtl2   v1.8h, v16.16b
    add     v2.8h, v2.8h, v1.8h
    sqxtun  v1.8b, v2.8h
    st1     {v1.s}[0], [x0], x1
    st1     {v1.s}[1], [x0], x1
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsIDctFourT4Rec_AArch64_neon
// In  : x0 = reconstruction destination (8x8 bytes), w1 = its row stride
//       x2 = prediction (8x8 bytes),                 w3 = its row stride
//       x4 = 64 int16 coefficients
// Each iteration reconstructs four rows of eight pixels.
// NOTE(review): strides assumed to be 32-bit signed ints in the C prototype
// (not visible in this file) - confirm. They are sign-extended before being
// used as 64-bit post-index registers.
// Clob: x0-x4, v0-v7, v16, v17
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
.rept 2
    ld1     {v16.d}[0], [x2], x3
    ld1     {v16.d}[1], [x2], x3
    ld1     {v17.d}[0], [x2], x3
    ld1     {v17.d}[1], [x2], x3            // prediction
    ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [x4], #64 // dct coefficients, de-interleaved

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7
    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7

    srshr   v0.8h, v0.8h, #6                // rounding shift by 6
    srshr   v1.8h, v1.8h, #6
    srshr   v2.8h, v2.8h, #6
    srshr   v3.8h, v3.8h, #6

    // add the prediction and clip into [0, 255]
    uxtl    v4.8h, v16.8b
    add     v0.8h, v0.8h, v4.8h
    sqxtun  v0.8b, v0.8h
    st1     {v0.d}[0], [x0], x1

    uxtl2   v5.8h, v16.16b
    add     v1.8h, v1.8h, v5.8h
    sqxtun  v1.8b, v1.8h
    st1     {v1.d}[0], [x0], x1

    uxtl    v6.8h, v17.8b
    add     v2.8h, v2.8h, v6.8h
    sqxtun  v2.8b, v2.8h
    st1     {v2.d}[0], [x0], x1

    uxtl2   v7.8h, v17.16b
    add     v3.8h, v3.8h, v7.8h
    sqxtun  v3.8b, v3.8h
    st1     {v3.d}[0], [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsHadamardT4Dc_AArch64_neon
// In  : x0 = destination, 16 int16 values
//       x1 = source, one int16 is read every 32 bytes, 16 reads in total
// The result of the 4x4 Hadamard transform is halved with rounding and
// saturated to int16 before it is stored.
// Clob: x1, x2, v0-v7
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardT4Dc_AArch64_neon
    mov     x2, #32                         // byte distance between the samples
    ld1     {v0.h}[0], [x1], x2
    ld1     {v1.h}[0], [x1], x2
    ld1     {v0.h}[1], [x1], x2
    ld1     {v1.h}[1], [x1], x2

    ld1     {v2.h}[0], [x1], x2
    ld1     {v3.h}[0], [x1], x2
    ld1     {v2.h}[1], [x1], x2
    ld1     {v3.h}[1], [x1], x2

    ld1     {v0.h}[2], [x1], x2
    ld1     {v1.h}[2], [x1], x2
    ld1     {v0.h}[3], [x1], x2
    ld1     {v1.h}[3], [x1], x2

    ld1     {v2.h}[2], [x1], x2
    ld1     {v3.h}[2], [x1], x2
    ld1     {v2.h}[3], [x1], x2
    ld1     {v3.h}[3], [x1], x2
    // v0[0 4 08 12], v1[1 5 09 13], v2[2 6 10 14], v3[3 7 11 15]

    ROW_TRANSFORM_0_STEP    v0, v1, v3, v2, v4, v7, v6, v5
    TRANSFORM_4BYTES        v0, v1, v3, v2, v4, v7, v6, v5

    // transpose the 4x4 matrix of 32-bit elements
    uzp1    v4.4s, v0.4s, v1.4s             // 0 2 4 6
    uzp2    v5.4s, v0.4s, v1.4s             // 1 3 5 7
    uzp1    v6.4s, v2.4s, v3.4s             // 8 10 12 14
    uzp2    v7.4s, v2.4s, v3.4s             // 9 11 13 15

    uzp1    v0.4s, v4.4s, v6.4s             // 0 4 8 12
    uzp2    v2.4s, v4.4s, v6.4s             // 2 6 10 14
    uzp1    v1.4s, v5.4s, v7.4s             // 1 5 9 13
    uzp2    v3.4s, v5.4s, v7.4s             // 3 7 11 15

    COL_TRANSFORM_0_STEP    v0, v1, v3, v2, v4, v7, v6, v5
    TRANSFORM_4BYTES        v0, v1, v3, v2, v4, v7, v6, v5

    sqrshrn     v4.4h, v0.4s, #1            // (x + 1) >> 1, saturated to int16
    sqrshrn2    v4.8h, v1.4s, #1
    sqrshrn     v5.4h, v2.4s, #1
    sqrshrn2    v5.8h, v3.4s, #1
    st1     {v4.16b, v5.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END

//---------------------------------------------------------------------------
// WelsIDctRecI16x16Dc_AArch64_neon
// In  : x0 = reconstruction destination (16 rows of 16 bytes), w1 = stride
//       x2 = prediction (16 rows of 16 bytes),                 w3 = stride
//       x4 = 16 int16 DC values, one per 4x4 block
// Every DC value is rounded ((dc + 32) >> 6), added to the 4x4 block of
// prediction pixels it belongs to, and the sum is saturated to [0, 255].
// NOTE(review): strides assumed to be 32-bit signed ints in the C prototype
// (not visible in this file) - confirm. They are sign-extended before being
// used as 64-bit post-index registers.
// Clob: x0-x3, v0-v5, v16, v17
//---------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v16.16b, v17.16b}, [x4]
    srshr   v16.8h, v16.8h, #6
    srshr   v17.8h, v17.8h, #6

    // rows 0..3: DC values 0..3
    dup     v0.8h, v16.h[0]
    dup     v1.8h, v16.h[1]
    ins     v0.d[1], v1.d[0]                // v0 = dc0 x4 | dc1 x4
    dup     v1.8h, v16.h[2]
    dup     v2.8h, v16.h[3]
    ins     v1.d[1], v2.d[0]                // v1 = dc2 x4 | dc3 x4
.rept 4
    ld1     {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1     {v3.16b}, [x0], x1
.endr

    // rows 4..7: DC values 4..7
    dup     v0.8h, v16.h[4]
    dup     v1.8h, v16.h[5]
    ins     v0.d[1], v1.d[0]
    dup     v1.8h, v16.h[6]
    dup     v2.8h, v16.h[7]
    ins     v1.d[1], v2.d[0]
.rept 4
    ld1     {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1     {v3.16b}, [x0], x1
.endr

    // rows 8..11: DC values 8..11
    dup     v0.8h, v17.h[0]
    dup     v1.8h, v17.h[1]
    ins     v0.d[1], v1.d[0]
    dup     v1.8h, v17.h[2]
    dup     v2.8h, v17.h[3]
    ins     v1.d[1], v2.d[0]
.rept 4
    ld1     {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1     {v3.16b}, [x0], x1
.endr

    // rows 12..15: DC values 12..15
    dup     v0.8h, v17.h[4]
    dup     v1.8h, v17.h[5]
    ins     v0.d[1], v1.d[0]
    dup     v1.8h, v17.h[6]
    dup     v2.8h, v17.h[7]
    ins     v1.d[1], v2.d[0]
.rept 4
    ld1     {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1     {v3.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END
#endif