ref: 2c6185a22d5a2b53c1ee002aa1db2a2ed2af311b
dir: /codec/processing/src/arm/down_sample_neon.S/
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

// ARM (32-bit) NEON down-sampling kernels.
//
// Conventions shared by every routine in this file, as visible in the code:
//   * r0-r3 carry the first four arguments; further arguments are read from
//     the caller's stack immediately above the registers pushed on entry.
//   * Only r4-r12/lr are saved and restored. The NEON registers used are
//     d0-d7 and d16-d31 (q0-q3, q8-q15); d8-d15 are never touched.
//   * Each routine restores lr with ldmia and then falls into
//     WELS_ASM_FUNC_END. That macro lives in arm_arch_common_macro.S (not
//     shown here) and presumably emits the actual return -- verify there.

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"


// DyadicBilinearDownsampler_neon
//   2:1 horizontal and vertical down-sampling, 32 source bytes per iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
//        [sp] = src_width, [sp+4] = src_height (stack offsets before the push).
//   Each output byte is vrhadd(vrshr(a+b,1), vrshr(c+d,1)) for the 2x2 source
//   block a b / c d.
//   Every iteration stores 16 bytes, so a row whose width is not a multiple
//   of 32 is overrun; see the tail save/restore below.
//   NOTE(review): src_height < 2 makes the row counter wrap -- callers are
//   presumably expected to pass at least two rows; confirm.
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    // Fetch the stack arguments (24 bytes were just pushed)
    ldr  r4, [sp, #24]          // src_width
    ldr  r5, [sp, #28]          // src_height

    // Initialize the registers
    mov r6, r2                  // r6 = start of the current source row pair
    mov r8, r0                  // r8 = start of the current destination row
    mov lr, #0                  // lr = source bytes consumed in this row
    lsr r5, #1                  // r5 = number of output rows

    // Save the 16 bytes at dst + dst_stride * rows (right after the last
    // output row); they are written back once the loop has finished.
    mla  r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3              // r7 = second row of the pair
    // Process 32 source columns of one row pair per iteration
comp_ds_bilinear_loop0:
    vld1.8 {q0,q1}, [r2]!
    vld1.8 {q2,q3}, [r7]!
    vpaddl.u8 q0, q0            // horizontal pair sums, widened to 16 bit
    vpaddl.u8 q1, q1
    vpaddl.u8 q2, q2
    vpaddl.u8 q3, q3
    vrshr.u16 q0, #1            // rounded horizontal average
    vrshr.u16 q1, #1
    vrshr.u16 q2, #1
    vrshr.u16 q3, #1
    vrhadd.u16 q0, q2           // rounded vertical average
    vrhadd.u16 q1, q3
    vmovn.u16 d0, q0
    vmovn.u16 d1, q1
    vst1.32 {q0}, [r0]!
    add lr, #32

    // Once lr >= src_width (carry set) step to the next row pair.
    cmp lr, r4
    movcs lr, #0
    addcs r6, r6, r3, lsl #1    // advance two source rows
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1                // advance one destination row
    movcs r0, r8
    subscs r5, #1               // only counts down at the end of a row
    // Mid-row the flags still come from cmp (lr < r4, hence NE) and the loop
    // continues; at a row end they come from subs and the loop exits at zero.
    bne comp_ds_bilinear_loop0

    // Restore the tail bytes saved on entry (r0 now equals that address)
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END


// comp_ds_bilinear_w_x8_neon
//   2:1 down-sampling, 8 source bytes (4 output bytes) per inner iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
//        [sp] = src_width, [sp+4] = src_height.
//   The inner loop runs src_width >> 3 times.
//   NOTE(review): a width below 8 or a height below 2 makes the counters
//   wrap -- presumably excluded by the caller; confirm.
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
    stmdb sp!, {r4-r7, lr}

    // Fetch the stack arguments (20 bytes were just pushed)
    ldr  r4, [sp, #20]          // src_width
    ldr  r5, [sp, #24]          // src_height

    // Row-to-row pointer adjustments
    sub lr, r3, r4              // lr = src stride - src_width
    sub r1, r1, r4, lsr #1      // r1 = dst stride - src_width / 2

    lsr r5, #1                  // r5 = number of output rows

    // Process one row pair
comp_ds_bilinear_w_x8_loop0:
    lsr r6, r4, #3              // r6 = 8-byte blocks per row
    add r7, r2, r3              // r7 = second row of the pair
    // Process one 8-byte block
comp_ds_bilinear_w_x8_loop1:
    vld1.8 {d0}, [r2]!
    vld1.8 {d1}, [r7]!
    vpaddl.u8 q0, q0            // d0 = top-row pair sums, d1 = bottom-row
    vrshr.u16 q0, #1
    vrhadd.u16 d0, d1

    vmovn.u16 d0, q0            // only the low four bytes are meaningful
    vst1.32 {d0[0]}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x8_loop1

    add r2, r7, lr              // r7 ended at row end; skip padding to the next pair
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x8_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


// comp_ds_bilinear_w_x16_neon
//   2:1 down-sampling, 16 source bytes (8 output bytes) per inner iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
//        [sp] = src_width, [sp+4] = src_height.
//   The inner loop runs src_width >> 4 times.
//   NOTE(review): a width below 16 or a height below 2 makes the counters
//   wrap -- presumably excluded by the caller; confirm.
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
    stmdb sp!, {r4-r7, lr}

    // Fetch the stack arguments (20 bytes were just pushed)
    ldr  r4, [sp, #20]          // src_width
    ldr  r5, [sp, #24]          // src_height

    // Row-to-row pointer adjustments
    sub lr, r3, r4              // lr = src stride - src_width
    sub r1, r1, r4, lsr #1      // r1 = dst stride - src_width / 2

    lsr r5, #1                  // r5 = number of output rows

    // Process one row pair
comp_ds_bilinear_w_x16_loop0:
    lsr r6, r4, #4              // r6 = 16-byte blocks per row
    add r7, r2, r3              // r7 = second row of the pair
    // Process one 16-byte block
comp_ds_bilinear_w_x16_loop1:
    vld1.8 {q0}, [r2]!
    vld1.8 {q1}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrhadd.u16 q0, q1
    vmovn.u16 d0, q0
    vst1.32 {d0}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x16_loop1

    add r2, r7, lr              // r7 ended at row end; skip padding to the next pair
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x16_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


// DyadicBilinearDownsamplerWidthx32_neon
//   2:1 down-sampling, 32 source bytes (16 output bytes) per inner iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
//        [sp] = src_width, [sp+4] = src_height.
//   The inner loop runs src_width >> 5 times.
//   NOTE(review): a width below 32 or a height below 2 makes the counters
//   wrap -- presumably excluded by the caller; confirm.
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
    stmdb sp!, {r4-r7, lr}

    // Fetch the stack arguments (20 bytes were just pushed)
    ldr  r4, [sp, #20]          // src_width
    ldr  r5, [sp, #24]          // src_height

    // Row-to-row pointer adjustments
    sub lr, r3, r4              // lr = src stride - src_width
    sub r1, r1, r4, lsr #1      // r1 = dst stride - src_width / 2

    lsr r5, #1                  // r5 = number of output rows

    // Process one row pair
comp_ds_bilinear_w_x32_loop0:
    lsr r6, r4, #5              // r6 = 32-byte blocks per row
    add r7, r2, r3              // r7 = second row of the pair
    // Process one 32-byte block
comp_ds_bilinear_w_x32_loop1:
    vld1.8 {q0,q1}, [r2]!
    vld1.8 {q2,q3}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vpaddl.u8 q2, q2
    vpaddl.u8 q3, q3
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrshr.u16 q2, #1
    vrshr.u16 q3, #1
    vrhadd.u16 q0, q2
    vrhadd.u16 q1, q3
    vmovn.u16 d0, q0
    vmovn.u16 d1, q1
    vst1.32 {q0}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x32_loop1

    add r2, r7, lr              // r7 ended at row end; skip padding to the next pair
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x32_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


// GeneralBilinearAccurateDownsampler_neon
//   Arbitrary-ratio bilinear down-sampling, one output pixel per iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = dst width, r3 = dst height
//        (roles inferred from how the registers are used below),
//        [sp] = src, [sp+4] = src stride, [sp+8] = scaleX, [sp+12] = scaleY.
//   Source positions are fixed point with 15 fractional bits: the integer
//   source index is position >> 15 and both positions start at 16384.
//   The last pixel of each row and the whole last row are plain copies of
//   the nearest source pixel; all other pixels are a weighted sum of a 2x2
//   source neighbourhood, rounded and shifted right by 30.
//   NOTE(review): dst width or height of 1 (or 0) makes the "count - 1" loop
//   counters wrap -- presumably excluded by the caller; confirm.
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
    stmdb sp!, {r4-r12, lr}

    // Fetch the stack arguments (40 bytes were just pushed)
    ldr  r4, [sp, #40]          // the addr of src
    ldr  r5, [sp, #44]          // the value of src_stride
    ldr  r6, [sp, #48]          // the value of scaleX
    ldr  r7, [sp, #52]          // the value of scaleY

    mov  r10, #32768
    sub  r10, #1                // r10 = 0x7fff, the fraction mask
    and  r8, r6, r10            // r8 uinc (scaleX mod 32768)
    mov  r11, #-1
    mul  r11, r8                // r11 -uinc

    vdup.s16 d2, r8
    vdup.s16 d0, r11
    vzip.s16 d0, d2             // uinc -uinc uinc -uinc (lanes high to low)

    and  r9, r7, r10            // r9 vinc (scaleY mod 32768)
    mov  r11, #-1
    mul  r11, r9                // r11 -vinc

    vdup.s16 d2, r9
    vdup.s16 d3, r11
    vext.8   d5, d3, d2, #4     // vinc vinc -vinc -vinc (lanes high to low)

    mov  r11, #0x40000000
    mov  r12, #0x4000
    sub  r12, #1
    add  r11, r12               // r11 = 0x40003fff
    vdup.s32 d1, r11            // init u  16384 16383 16384 16383

    mov  r11, #16384
    vdup.s16 d16, r11
    sub  r11, #1
    vdup.s16 d17, r11
    vext.8   d7, d17, d16, #4   // init v  16384 16384 16383 16383

    veor q14, q14               // upper bytes of the pixel lanes stay zero
    sub  r1, r2                 // stride - width
    mov  r8, #16384             // yInverse
    sub  r3, #1                 // the last row is handled separately

_HEIGHT:
    ldr  r4, [sp, #40]          // the addr of src
    mov  r11, r8
    lsr  r11, #15
    mul  r11, r5
    add  r11, r4                // get current row address
    mov  r12, r11
    add  r12, r5                // r12 = the row below it

    mov  r9, #16384             // xInverse
    sub  r10, r2, #1            // the last pixel is handled separately
    vmov.s16 d6, d1             // reset the horizontal weights

_WIDTH:
    mov  lr, r9
    lsr  lr, #15                // lr = integer source column
    add  r4, r11, lr
    vld2.8 {d28[0],d29[0]}, [r4]    // q14: 0000000b0000000a
    add  r4, r12, lr
    vld2.8 {d28[4],d29[4]}, [r4]    // q14: 000d000b000c000a
    vzip.32 d28, d29                // q14: 000d000c000b000a

    vmull.u16 q13, d6, d7       // q13: init u * init v (four 32-bit weights)
    vmull.u32 q12, d26, d28
    vmlal.u32 q12, d27, d29     // q12 = two partial weighted sums
    vqadd.u64 d24, d24, d25     // d24 = sum of the four weighted pixels
    vrshr.u64 d24, #30          // drop the 30 fractional weight bits, rounded

    vst1.8 {d24[0]}, [r0]!
    add  r9, r6
    vadd.u16 d6, d0             // inc u
    vshl.u16 d6, #1
    vshr.u16 d6, #1             // shift pair clears bit 15 of each weight

    subs r10, #1
    bne _WIDTH

WIDTH_END:
    // Last pixel of the row: straight copy from the upper source row
    lsr  r9, #15
    add  r4, r11, r9
    vld1.8 {d24[0]}, [r4]
    vst1.8 {d24[0]}, [r0]
    add  r0, #1
    add  r8, r7
    add  r0, r1                 // skip destination padding to the next row

    vadd.s16 d7, d5             // inc v
    vshl.u16 d7, #1
    vshr.u16 d7, #1             // shift pair clears bit 15 of each weight

    subs r3, #1
    bne _HEIGHT

LAST_ROW:
    // Last row: straight copy of the nearest source pixel in every column
    ldr  r4, [sp, #40]          // the addr of src
    lsr  r8, #15
    mul  r8, r5
    add  r4, r8                 // get current row address
    mov  r9, #16384

_LAST_ROW_WIDTH:
    mov  r11, r9
    lsr  r11, #15

    add  r3, r4, r11
    vld1.8 {d0[0]}, [r3]
    vst1.8 {d0[0]}, [r0]
    add  r0, #1
    add  r9, r6
    subs r2, #1
    bne _LAST_ROW_WIDTH

    ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END


// DyadicBilinearOneThirdDownsampler_neon
//   3:1 down-sampling, 48 source bytes (16 output bytes) per iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
//        [sp] = src_width, [sp+4] = row count (see the note on r5 below).
//   vld3.8 de-interleaves every group of three source bytes; only the first
//   two of each group, from the first two of every three rows, are averaged.
//   Every iteration stores 16 bytes; see the tail save/restore below.
WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    // Fetch the stack arguments (24 bytes were just pushed)
    ldr  r4, [sp, #24]          // src_width
    // The original comment calls this src_height, but it is used unscaled as
    // the number of output rows (the source advances three rows per count).
    // NOTE(review): confirm against the caller which height is passed.
    ldr  r5, [sp, #28]

    // Initialize the registers
    mov r6, r2                  // r6 = start of the current source row group
    mov r8, r0                  // r8 = start of the current destination row
    mov lr, #0                  // lr = source bytes consumed in this row

    // Save the 16 bytes at dst + dst_stride * r5 (right after the last
    // output row); they are written back once the loop has finished.
    mla  r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3              // r7 = second row of the group
    // Process 48 source columns of one row group per iteration
comp_ds_bilinear_onethird_loop0:
    vld3.8 {d0, d1, d2}, [r2]!
    vld3.8 {d3, d4, d5}, [r2]!
    vld3.8 {d16, d17, d18}, [r7]!
    vld3.8 {d19, d20, d21}, [r7]!

    vaddl.u8 q11, d0, d1        // sum of columns 0 and 1 of each triple
    vaddl.u8 q12, d3, d4
    vaddl.u8 q13, d16, d17
    vaddl.u8 q14, d19, d20
    vrshr.u16 q11, #1           // rounded horizontal average
    vrshr.u16 q12, #1
    vrshr.u16 q13, #1
    vrshr.u16 q14, #1
    vrhadd.u16 q11, q13         // rounded vertical average
    vrhadd.u16 q12, q14
    vmovn.u16 d0, q11
    vmovn.u16 d1, q12
    vst1.8 {q0}, [r0]!

    add lr, #48
    // Once lr >= src_width (carry set) step to the next row group.
    cmp lr, r4
    movcs lr, #0
    addcs r6, r6, r3, lsl #1
    addcs r6, r6, r3            // together: advance three source rows
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1                // advance one destination row
    movcs r0, r8
    subscs r5, #1               // only counts down at the end of a row
    // Mid-row the flags still come from cmp (lr < r4, hence NE) and the loop
    // continues; at a row end they come from subs and the loop exits at zero.
    bne comp_ds_bilinear_onethird_loop0

    // Restore the tail bytes saved on entry (r0 now equals that address)
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END


// DyadicBilinearQuarterDownsampler_neon
//   4:1 down-sampling, 64 source bytes (16 output bytes) per iteration.
//   In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
//        [sp] = src_width, [sp+4] = src_height.
//   vld2.16 de-interleaves 16-bit units, so only the first two bytes of every
//   four, from the first two of every four rows, are averaged.
//   Every iteration stores 16 bytes; see the tail save/restore below.
//   NOTE(review): src_height < 4 makes the row counter wrap -- presumably
//   excluded by the caller; confirm.
WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    // Fetch the stack arguments (24 bytes were just pushed)
    ldr  r4, [sp, #24]          // src_width
    ldr  r5, [sp, #28]          // src_height

    // Initialize the registers
    mov r6, r2                  // r6 = start of the current source row group
    mov r8, r0                  // r8 = start of the current destination row
    mov lr, #0                  // lr = source bytes consumed in this row
    lsr r5, #2                  // r5 = number of output rows

    // Save the 16 bytes at dst + dst_stride * rows (right after the last
    // output row); they are written back once the loop has finished.
    mla  r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3              // r7 = second row of the group
    // Process 64 source columns of one row group per iteration
comp_ds_bilinear_quarter_loop0:
    vld2.16 {q0, q1}, [r2]!     // q0 = even 16-bit units, q1 = odd (unused)
    vld2.16 {q2, q3}, [r2]!
    vld2.16 {q8, q9}, [r7]!
    vld2.16 {q10, q11}, [r7]!

    vpaddl.u8 q0, q0            // horizontal pair sums, widened to 16 bit
    vpaddl.u8 q2, q2
    vpaddl.u8 q8, q8
    vpaddl.u8 q10, q10
    vrshr.u16 q0, #1            // rounded horizontal average
    vrshr.u16 q2, #1
    vrshr.u16 q8, #1
    vrshr.u16 q10, #1
    vrhadd.u16 q0, q8           // rounded vertical average
    vrhadd.u16 q2, q10
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vst1.8 {q0}, [r0]!

    add lr, #64
    // Once lr >= src_width (carry set) step to the next row group.
    cmp lr, r4
    movcs lr, #0
    addcs r6, r6, r3, lsl #2    // advance four source rows
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1                // advance one destination row
    movcs r0, r8
    subscs r5, #1               // only counts down at the end of a row
    // Mid-row the flags still come from cmp (lr < r4, hence NE) and the loop
    // continues; at a row end they come from subs and the loop exits at zero.
    bne comp_ds_bilinear_quarter_loop0

    // Restore the tail bytes saved on entry (r0 now equals that address)
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END

#endif