ref: b001785eeeff8d165ff8ee5fdb564600a8b1ece8
dir: /codec/processing/src/arm/adaptive_quantization.S/
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"
/*
 * SQR_ADD_16BYTES arg0, arg1, arg2
 *
 * Squares 16 unsigned bytes and adds the squares into a running total.
 *
 *   arg0, arg1 : D registers, 8 unsigned bytes each (read only)
 *   arg2       : Q register used as an accumulator; vpadal.u16 treats it
 *                as four unsigned 32-bit lanes (read and written)
 *
 * Each accumulator lane receives the sum of two adjacent 16-bit squares
 * from each of the two intermediate products.
 *
 * Clobbers: q3 and q8 (fixed scratch registers). The operands must
 * therefore not live in q3/q8 (d6, d7, d16, d17).
 */
.macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0 // q3 = 8 x u16: square of every byte in arg0
vmull.u8 q8, \arg1, \arg1 // q8 = 8 x u16: square of every byte in arg1
vpadal.u16 \arg2, q3 // arg2 (4 x u32) += pairwise sums of the q3 squares
vpadal.u16 \arg2, q8 // arg2 (4 x u32) += pairwise sums of the q8 squares
.endm
/*
 * SampleVariance16x16_neon
 *
 * Reads 16 rows of 16 bytes from a "ref" buffer and a "src" buffer and
 * stores two 16-bit results computed over all 256 sample positions:
 *
 *   result 0 = (sqr     >> 8) - (sum     >> 8)^2
 *   result 1 = (sqr_cur >> 8) - (sum_cur >> 8)^2
 *
 * where
 *   sum     = total of |src - ref|        sqr     = total of |src - ref|^2
 *   sum_cur = total of src                sqr_cur = total of src^2
 *
 * Register roles as used by the code below:
 *   r0 : ref row pointer, post-incremented by r1 after every row load
 *   r1 : ref stride in bytes
 *   r2 : src row pointer, post-incremented by r3 after every row load
 *   r3 : src stride in bytes
 *   word at [sp, #4] after r4 has been pushed : address of the output pair
 *     (this is [sp] on entry, presumably the fifth argument - this assumes
 *      WELS_ASM_FUNC_BEGIN does not move sp; TODO confirm against
 *      arm_arch_common_macro.S)
 *
 * Output: result 0 is written as 16 bits at [out], result 1 as 16 bits at
 * [out + 2]. Only the low 16 bits of each 32-bit result are stored.
 *
 * Accumulators:
 *   q13 : 8 x u16 partial sums of |src - ref|   (sum)
 *   q10 : 8 x u16 partial sums of src           (sum_cur)
 *   q12 : 4 x u32 partial sums of |src - ref|^2 (sqr)
 *   q9  : 4 x u32 partial sums of src^2         (sqr_cur)
 * Every u16 lane collects 2 bytes per row over 16 rows (at most 32 * 255),
 * so the 16-bit sums cannot overflow.
 *
 * Clobbers: r0, r2, condition flags, q0-q3, q8-q15.
 * r4 is saved and restored; q4-q7 are not touched.
 * The return sequence is not visible here; presumably it is emitted by
 * WELS_ASM_FUNC_END (TODO confirm).
 */
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4} // save r4: used below as loop counter and then as output pointer
// ---- Row 0 is handled outside the loop: it initialises the four
// ---- accumulators directly, so no register has to be zeroed first.
vld1.8 {q15}, [r0], r1 // q15 = ref row 0 (16 bytes), r0 += r1
vld1.8 {q14}, [r2], r3 // q14 = src row 0 (16 bytes), r2 += r3
vabd.u8 q13, q14, q15 // q13 = |src - ref| per byte
vmull.u8 q12, d27, d27 // q12 = 8 x u16 squares of diff bytes 8..15
vmull.u8 q11, d26, d26 // q11 = 8 x u16 squares of diff bytes 0..7
vaddl.u16 q12, d24, d25 // in place: widen and add the two halves of q12 -> 4 x u32
vpadal.u16 q12, q11 // sqr: q12 (4 x u32) += pairwise sums of q11
vaddl.u8 q13, d26, d27 // sum: q13 = 8 x u16, low diff half + high diff half
vaddl.u8 q10, d28, d29 // sum_cur: q10 = 8 x u16, low src half + high src half
vmull.u8 q9, d29, d29 // q9 = 8 x u16 squares of src bytes 8..15
vmull.u8 q8, d28, d28 // q8 = 8 x u16 squares of src bytes 0..7
vaddl.u16 q9, d18, d19 // sqr_cur: in place, widen and add the two halves of q9 -> 4 x u32
vpadal.u16 q9, q8 // sqr_cur: q9 (4 x u32) += pairwise sums of q8
mov r4, #15 // 15 rows remain after row 0
// ---- Rows 1..15 ----
pixel_var_16x16_loop0:
vld1.8 {q0}, [r0], r1 // q0 = next ref row (16 bytes), r0 += r1
vld1.8 {q1}, [r2], r3 // q1 = next src row (16 bytes), r2 += r3
vabd.u8 q2, q0, q1 // q2 = |ref - src| per byte
// sum_cur: q10 (8 x u16) += pairwise sums of the src bytes
vpadal.u8 q10, q1
// sqr: q12 (4 x u32) += squares of the 16 diff bytes (clobbers q3, q8)
SQR_ADD_16BYTES d4, d5, q12
// sum: q13 (8 x u16) += pairwise sums of the diff bytes
vpadal.u8 q13, q2
subs r4, #1 // sets Z for the bne below; the NEON instructions in between leave the flags alone
// sqr_cur: q9 (4 x u32) += squares of the 16 src bytes (clobbers q3, q8)
SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0
// ---- Horizontal reduction of the four accumulators ----
vadd.u16 d0, d26, d27 // sum: fold 8 x u16 lanes into 4
vadd.u16 d1, d20, d21 // sum_cur: fold 8 x u16 lanes into 4
vpaddl.u16 q0, q0 // widen pairwise: d0 = 2 x u32 of sum, d1 = 2 x u32 of sum_cur
vadd.u32 d2, d24, d25 // sqr: fold 4 x u32 lanes into 2
vadd.u32 d3, d18, d19 // sqr_cur: fold 4 x u32 lanes into 2
vpadd.u32 d0, d0, d1 // d0 = { sum, sum_cur }
vpadd.u32 d1, d2, d3 // d1 = { sqr, sqr_cur }
ldr r4, [sp, #4] // r4 = output address; offset 4 skips the r4 slot pushed at entry
vshr.u32 q0, q0, #8 // all four totals >> 8 (divide by the 256 samples)
vmul.u32 d0, d0 // two-operand form, d0 = d0 * d0: { (sum >> 8)^2, (sum_cur >> 8)^2 }
vsub.u32 d0, d1, d0 // d0 = { (sqr >> 8) - (sum >> 8)^2, (sqr_cur >> 8) - (sum_cur >> 8)^2 }
vmovl.u32 q0, d0 // zero-extend to 2 x u64 so result 0 sits in d0 and result 1 in d1
vst2.16 {d0[0], d1[0]}, [r4] // store the low 16 bits of result 0 at [r4] and of result 1 at [r4 + 2]
ldmia sp!, {r4} // restore r4
WELS_ASM_FUNC_END
#endif