shithub: openh264


ref: 57fc3e991792ea277a309bcc9351bb800d46b380
parent: eb9f56584fae81eab9be6ab999040ed5e4a7cfcd
author: Sindre Aamås <saamas@cisco.com>
date: Fri Apr 8 13:05:38 EDT 2016

[Processing] Add AVX2 VAA routines

Process 8 lines at a time rather than 16 lines at a time because
this appears to give more reliable memory subsystem performance on
Haswell.

Speedup is > 2x compared to SSE2 when not memory-bound on Haswell.
On my Haswell MBP, VAACalcSadSsdBgd is roughly 3x faster when uncached,
which appears to be related to processing 8 lines at a time rather than
16. The other routines are also faster than their SSE2 counterparts in
this case, but to a lesser extent.
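
For orientation, a rough scalar sketch of what VAACalcSad computes. The signature follows the prototype in this patch; the psad8x8 layout (four 8x8 results per 16x16 macroblock, ordered top-left, top-right, bottom-left, bottom-right) is inferred from the xcnt_unit-4 indexing in the assembly and should be treated as an assumption.

// Scalar reference sketch, for orientation only; not part of the patch.
#include <cstdint>

static void VAACalcSad_ref (const uint8_t* pCurData, const uint8_t* pRefData,
                            int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                            int32_t* pFrameSad, int32_t* pSad8x8) {
  const int32_t iMbWidth = iPicWidth >> 4;
  int32_t iFrameSad = 0;
  for (int32_t iMbY = 0; iMbY < (iPicHeight >> 4); ++iMbY) {
    for (int32_t iMbX = 0; iMbX < iMbWidth; ++iMbX) {
      for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {     // four 8x8 blocks per macroblock
        const int32_t iX = (iMbX << 4) + ((iBlk & 1) << 3);
        const int32_t iY = (iMbY << 4) + ((iBlk >> 1) << 3);
        int32_t iSad = 0;
        for (int32_t y = 0; y < 8; ++y) {
          for (int32_t x = 0; x < 8; ++x) {
            const int32_t iDiff = (int32_t) pCurData[(iY + y) * iPicStride + iX + x] -
                                  (int32_t) pRefData[(iY + y) * iPicStride + iX + x];
            iSad += iDiff >= 0 ? iDiff : -iDiff;
          }
        }
        pSad8x8[(iMbY * iMbWidth + iMbX) * 4 + iBlk] = iSad;
        iFrameSad += iSad;
      }
    }
  }
  *pFrameSad = iFrameSad;
}

The AVX2 routines below produce the same per-block and per-frame outputs (plus sums, squared sums, squared differences, signed differences, and per-block MAD for the larger variants), but read 16 or 32 pixels per instruction and walk the picture 8 rows at a time.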

--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -64,6 +64,13 @@
     sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
     sVaaFuncs.pfVAACalcSadVar    = VAACalcSadVar_sse2;
   }
+  if (iCpuFlag & WELS_CPU_AVX2) {
+    sVaaFuncs.pfVAACalcSad       = VAACalcSad_avx2;
+    sVaaFuncs.pfVAACalcSadBgd    = VAACalcSadBgd_avx2;
+    sVaaFuncs.pfVAACalcSadSsd    = VAACalcSadSsd_avx2;
+    sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_avx2;
+    sVaaFuncs.pfVAACalcSadVar    = VAACalcSadVar_avx2;
+  }
 #endif//X86_ASM
 #ifdef HAVE_NEON
   if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
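
Because the new WELS_CPU_AVX2 block follows the WELS_CPU_SSE2 block, the AVX2 assignments overwrite the SSE2 pointers on CPUs that report both flags, so the strongest supported implementation wins. A minimal sketch of this cascading selection, with hypothetical names not taken from the codebase:

// Illustrative only: capability checks ordered weakest to strongest.
#include <cstdint>

typedef void (*PfVAACalcSad) (const uint8_t*, const uint8_t*, int32_t, int32_t,
                              int32_t, int32_t*, int32_t*);

static PfVAACalcSad SelectVAACalcSad (uint32_t uiCpuFlag, uint32_t kuiSse2Flag, uint32_t kuiAvx2Flag,
                                      PfVAACalcSad pfC, PfVAACalcSad pfSse2, PfVAACalcSad pfAvx2) {
  PfVAACalcSad pf = pfC;                        // portable fallback
  if (uiCpuFlag & kuiSse2Flag) pf = pfSse2;
  if (uiCpuFlag & kuiAvx2Flag) pf = pfAvx2;     // later check overrides the earlier one
  return pf;
}
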
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -104,6 +104,11 @@
 VAACalcSadFunc          VAACalcSad_sse2;
 VAACalcSadVarFunc       VAACalcSadVar_sse2;
 VAACalcSadSsdFunc       VAACalcSadSsd_sse2;
+VAACalcSadBgdFunc       VAACalcSadBgd_avx2;
+VAACalcSadSsdBgdFunc    VAACalcSadSsdBgd_avx2;
+VAACalcSadFunc          VAACalcSad_avx2;
+VAACalcSadVarFunc       VAACalcSadVar_avx2;
+VAACalcSadSsdFunc       VAACalcSadSsd_avx2;
 WELSVP_EXTERN_C_END
 #endif
 
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -2028,3 +2028,1532 @@
 %undef          localsize
     ret
 %endif
+
+%ifdef X86_32
+%define ptrword dword
+%else
+%define ptrword qword
+%endif
+
+%define xmm_width 16
+%define ymm_width 32
+
+%macro PUSHM 1-*
+    %rep %0
+        push           %1
+        %rotate 1
+    %endrep
+    %assign push_num push_num + %0
+%endmacro
+
+%macro POPM 1-*
+    %rep %0
+        %rotate -1
+        pop            %1
+    %endrep
+    %assign push_num push_num - %0
+%endmacro
+
+%ifdef X86_32
+%define stack_alloc_min 4
+%else
+%define stack_alloc_min 8
+%endif
+
+; Allocate aligned stack space.
+; address_out=%1 size=%2 alignment=%3
+%macro STACK_ALLOC 3
+%if (%3) & ((%3) - 1)
+    %error non-power-of-2 alignment requested.
+%endif
+%if (%3) > 0
+    %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min
+%else
+    %assign stack_alloc_align 1
+%endif
+    %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1
+    %assign push_num push_num + stack_alloc_num
+    sub            r7, stack_alloc_min * stack_alloc_num
+%if stack_alloc_align == 1
+    mov            %1, r7
+%else
+    lea            %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)]
+    and            %1, -(stack_alloc_min * stack_alloc_align)
+%endif
+%endmacro
+
+; Deallocate stack space allocated with STACK_ALLOC.
+%macro STACK_DEALLOC 0
+    add            r7, stack_alloc_min * stack_alloc_num
+    %assign push_num push_num - stack_alloc_num
+%endmacro
+
+; Max unsigned byte per quadword
+; out=%1 in=%2 tmp=%3
+%macro AVX2_Maxubq 3
+    vpsrlq         %3, %2, 32
+    vpmaxub        %1, %2, %3
+    vpsrlq         %3, %1, 16
+    vpmaxub        %1, %1, %3
+    vpsrlq         %3, %1,  8
+    vpmaxub        %1, %1, %3
+%endmacro
+
+; Max unsigned byte per quadword. 2 register input.
+; Results interleaved as least significant byte of even/odd doublewords.
+; out=%1 in_a=%2 in_b=%3 tmp=%4
+%macro AVX2_Maxubq2 4
+    vpblendd       %4, %2, %3, 10101010b
+    vpshufd        %4, %4, 10110001b
+    vpblendd       %1, %2, %3, 01010101b
+    vpmaxub        %1, %4, %1
+    vpsrld         %4, %1, 16
+    vpmaxub        %1, %1, %4
+    vpsrld         %4, %1,  8
+    vpmaxub        %1, %1, %4
+%endmacro
+
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sqsumbdw 5
+    vpunpcklbw     %4, %2, %3
+%if %5
+    vpmaddwd       %4, %4, %4
+    vpaddd         %1, %1, %4
+%else
+    vpmaddwd       %1, %4, %4
+%endif
+    vpunpckhbw     %4, %2, %3
+    vpmaddwd       %4, %4, %4
+    vpaddd         %1, %1, %4
+%endmacro
+
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sumbdw 5
+%if %5
+    vpsadbw        %4, %2, %3
+    vpaddd         %1, %1, %4
+%else
+    vpsadbw        %1, %2, %3
+%endif
+%endmacro
+
+; res=%1 a=%2 b=%3 a=%4 (second copy of a; may be a memory operand) tmp=%5
+%macro AVX2_AbsDiffub 5
+    vpsubusb       %5, %2, %3
+    vpsubusb       %1, %3, %4
+    vpor           %1, %5, %1
+%endmacro
+
+; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sadbdw 5
+%if %5
+    vpsadbw        %4, %2, %3
+    vpaddd         %1, %1, %4
+%else
+    vpsadbw        %1, %2, %3
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8
+%macro AVX2_SadSumSqsumbdw 8
+    AVX2_Sadbdw    %1, %4, %5, %7, %8
+    AVX2_Sumbdw    %2, %4, %6, %7, %8
+    AVX2_Sqsumbdw  %3, %4, %6, %7, %8
+%endmacro
+
+; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sad 5
+    vmovdqu        %4, [%2]
+    AVX2_Sadbdw    %1, %4, [%3], %4, %5
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9
+%macro AVX2_SadSumSqsum 9
+    vmovdqu        %7, [%4]
+    AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSumSqsumSqdiff 11
+    vmovdqu        %8,  [%5]
+    vmovdqu        %9,  [%6]
+    AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11
+    AVX2_AbsDiffub %9,  %8,  %9,  %8,  %10
+    AVX2_Sqsumbdw  %4,  %9,  %7,  %10, %11
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSdMad 11
+    vmovdqu        %8,  [%5]
+    vmovdqu        %9,  [%6]
+    AVX2_Sumbdw    %2,  %8,  %7,  %10, %11
+    AVX2_Sumbdw    %3,  %9,  %7,  %10, %11
+    AVX2_Sadbdw    %1,  %8,  %9,  %10, %11
+%if %11
+    AVX2_AbsDiffub %9,  %8,  %9,  %8, %10
+    vpmaxub        %4,  %4,  %9
+%else
+    AVX2_AbsDiffub %4,  %8,  %9,  %8, %10
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13
+%macro AVX2_SadBgdSqdiff 13
+%ifidn %12, 0
+    vmovdqu        %10, [%7]
+    AVX2_Sumbdw    %2,  %10, %9,  %11, %13
+    AVX2_Sqsumbdw  %6,  %10, %9,  %11, %13
+    vmovdqu        %11, [%8]
+    AVX2_Sadbdw    %1,  %10, %11, %10, %13
+    AVX2_Sumbdw    %3,  %11, %9,  %10, %13
+    vmovdqu        %10, [%7]
+%if %13
+    AVX2_AbsDiffub %11, %10, %11, [%7], %10
+    vpmaxub        %4,  %4,  %11
+    AVX2_Sqsumbdw  %5,  %11, %9,  %10, %13
+%else
+    AVX2_AbsDiffub %4,  %10, %11, [%7], %10
+    AVX2_Sqsumbdw  %5,  %4,  %9,  %10, %13
+%endif
+%else
+    vmovdqu        %10, [%7]
+    vmovdqu        %11, [%8]
+    AVX2_Sadbdw    %1,  %10, %11, %12, %13
+    AVX2_Sumbdw    %2,  %10, %9,  %12, %13
+    AVX2_Sumbdw    %3,  %11, %9,  %12, %13
+    AVX2_Sqsumbdw  %6,  %10, %9,  %12, %13
+%if %13
+    AVX2_AbsDiffub %11, %10, %11, %10, %12
+    vpmaxub        %4,  %4,  %11
+    AVX2_Sqsumbdw  %5,  %11, %9,  %10, %13
+%else
+    AVX2_AbsDiffub %4,  %10, %11, %10, %12
+    AVX2_Sqsumbdw  %5,  %4,  %9,  %10, %13
+%endif
+%endif
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accdw 5
+    vpshufd        %2%4, %2%3, 1000b
+%ifidni %2, x
+    vmovlps        [%1 + 8 * %5], x%4
+%elif %5 == 0
+    vmovdqu        [%1], %2%4
+%else
+    vmovlps        [%1 +  8], x%4
+    vextracti128   x%4, %2%4, 1
+    vmovlps        [%1 + 24], x%4
+%endif
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accb 5
+    vpunpckhqdq    %2%4, %2%3, %2%3
+    vpunpcklbw     %2%4, %2%3, %2%4
+%if %5 == 0
+    vmovd          [%1 + 0], x%4
+%ifidni %2, y
+    vextracti128   x%4, %2%4, 1
+    vmovd          [%1 + 4], x%4
+%endif
+%else
+    vpextrw        [%1 + 2], x%4, 0
+%ifidni %2, y
+    vextracti128   x%4, %2%4, 1
+    vpextrw        [%1 + 6], x%4, 0
+%endif
+%endif
+%endmacro
+
+; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5
+%macro AVX2_Store2x8x8Accb 5
+    vpunpckhqdq    y%3, y%2, y%2
+    vpunpcklbw     y%3, y%2, y%3
+    vextracti128   x%4, y%3, 1
+    vpsllq         x%4, x%4, 32
+    vpblendd       x%4, x%3, x%4, 1010b
+%if %5
+    vpslld         x%4, x%4, 16
+    vpblendw       x%4, x%4, [%1], 01010101b
+%endif
+    vmovdqu        [%1], x%4
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5
+%macro AVX2_Store16x16Accdw 5
+%ifidni %2, x
+%if %5
+    vmovd          x%4, [%1 + 0]
+    vpaddd         x%3, x%4, x%3
+%endif
+    vmovd          [%1 + 0], x%3
+%elif %5 == 0
+    vmovd          [%1 + 0], x%3
+    vextracti128   x%3, %2%3, 1
+    vmovd          [%1 + 4], x%3
+%else
+    vextracti128   x%4, %2%3, 1
+    vpunpckldq     x%4, x%3, x%4
+    vmovq          x%3, [%1 + 0]
+    vpaddd         x%3, x%3, x%4
+    vmovlps        [%1 + 0], x%3
+%endif
+%endmacro
+
+; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8
+%macro AVX2_Store2x16x16Accdw 8
+%ifidni %5, x
+    mov            %4, %1
+%if %8 == 0
+    vmovd          [%4 + %3], x%6
+    mov            %4, %2
+    vpextrd        [%4 + %3], x%6, 2
+%else
+    vmovd          x%7, [%4 + %3]
+    vpaddd         x%7, x%7, x%6
+    vmovd          [%4 + %3], x%7
+    mov            %4, %2
+    vpbroadcastd   x%7, [%4 + %3]
+    vpaddd         x%7, x%7, x%6
+    vpextrd        [%4 + %3], x%7, 2
+%endif
+%else
+    vextracti128   x%7, %5%6, 1
+    vpblendd       x%6, x%6, x%7, 1010b
+    mov            %4, %1
+%if %8 == 0
+    vmovlps        [%4 + %3], x%6
+    mov            %4, %2
+    vmovhps        [%4 + %3], x%6
+%else
+    vmovq          x%7, [%4 + %3]
+    vpaddd         x%7, x%7, x%6
+    vmovlps        [%4 + %3], x%7
+    mov            %4, %2
+    vpbroadcastq   x%7, [%4 + %3]
+    vpaddd         x%7, x%7, x%6
+    vmovhps        [%4 + %3], x%7
+%endif
+%endif
+%endmacro
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSad_8Lines 7
+%define mm_tmp0    %2
+%define mm_sad     %3
+%define mm_sad2    %4
+%define mm_sad3    %5
+%define mm_sad4    %6
+%define b_second_blocks %7
+%ifdef i_stride5
+    %define i_stride5_ i_stride5
+%else
+    lea            r_tmp, [5 * i_stride]
+    %define i_stride5_ r_tmp
+%endif
+    ; Use multiple accumulators to shorten dependency chains and enable more parallelism.
+    AVX2_Sad       %1 %+ mm_sad,  p_cur,                  p_ref,                  %1 %+ mm_tmp0, 0
+    AVX2_Sad       %1 %+ mm_sad2, p_cur + 1 * i_stride,   p_ref + 1 * i_stride,   %1 %+ mm_tmp0, 0
+    AVX2_Sad       %1 %+ mm_sad3, p_cur + 2 * i_stride,   p_ref + 2 * i_stride,   %1 %+ mm_tmp0, 0
+    AVX2_Sad       %1 %+ mm_sad4, p_cur + 1 * i_stride3,  p_ref + 1 * i_stride3,  %1 %+ mm_tmp0, 0
+    AVX2_Sad       %1 %+ mm_sad,  p_cur + 4 * i_stride,   p_ref + 4 * i_stride,   %1 %+ mm_tmp0, 1
+    AVX2_Sad       %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1
+%ifdef i_stride7
+    %define i_stride7_ i_stride7
+%else
+    lea            r_tmp, [i_stride + 2 * i_stride3]
+    %define i_stride7_ r_tmp
+%endif
+    AVX2_Sad       %1 %+ mm_sad3, p_cur + 2 * i_stride3,  p_ref + 2 * i_stride3,  %1 %+ mm_tmp0, 1
+    AVX2_Sad       %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1
+%undef i_stride5_
+%undef i_stride7_
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    add            p_cur, %1 %+ mm_width
+    add            p_ref, %1 %+ mm_width
+    ; Collapse accumulators.
+    vpaddd         %1 %+ mm_sad,  %1 %+ mm_sad,  %1 %+ mm_sad2
+    vpaddd         %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4
+    vpaddd         %1 %+ mm_sad,  %1 %+ mm_sad,  %1 %+ mm_sad3
+    AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks
+    vpaddd         y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+%undef mm_tmp0
+%undef mm_sad
+%undef mm_sad2
+%undef mm_sad3
+%undef mm_sad4
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSad_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSad_avx2
+%define          p_sadframe                    ptrword arg6
+%define          p_sad8x8                      ptrword arg7
+%ifdef X86_32
+%define          saveregs                      r5, r6
+%else
+%define          saveregs                      rbx, rbp, r12
+%endif
+
+%assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    PUSHM          saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+    vmovdqa        y %+ mm_sadframe, y %+ mm_zero
+
+    and            r2, -16                     ; iPicWidth &= -16
+    jle            .done                       ; bail if iPicWidth < 16
+    sar            r3, 4                       ; iPicHeight / 16
+    jle            .done                       ; bail if iPicHeight < 16
+    shr            r2, 2                       ; iPicWidth / 4
+
+%define p_cur     r0
+%define p_ref     r1
+%define i_xcnt    r2
+%define i_ycnt    ptrword arg4
+%define i_stride  r4
+%define xcnt_unit 4
+%ifdef X86_32
+    mov            i_ycnt, r3
+    mov            r5, p_sad8x8
+    %define i_stride3 r3
+    %undef  p_sad8x8
+    %define p_sad8x8  r5
+    %define r_tmp     r6
+    lea            i_stride3, [3 * i_stride]
+%else
+    mov            rbp, p_sad8x8
+    %define i_stride3 rbx
+    %define i_stride5 r12
+    %define i_stride7 r6
+    %undef  p_sad8x8
+    %define p_sad8x8  rbp
+    lea            i_stride3, [3 * i_stride]
+    lea            i_stride5, [5 * i_stride]
+    lea            i_stride7, [i_stride + 2 * i_stride3]
+%endif
+
+    ; offset pointer so as to compensate for the i_xcnt offset below.
+    sub            p_sad8x8, 4 * 16 / xcnt_unit
+
+    push           i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+
+.height_loop:
+    ; use end-of-line pointers so as to enable use of a negative counter as index.
+    lea            p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt]
+    ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+    neg            i_xcnt
+    add            i_xcnt, 16 / xcnt_unit
+    jz             .width_loop_upper8_remaining16
+.width_loop_upper8:
+    AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_upper8
+    jg             .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+    AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    xor            i_xcnt, i_xcnt
+    sub            i_xcnt, i_xcnt_load
+    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
+    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
+    add            i_xcnt, 16 / xcnt_unit
+    jz             .width_loop_lower8_remaining16
+.width_loop_lower8:
+    AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_lower8
+    jg             .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+    AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    xor            i_xcnt, i_xcnt
+    sub            i_xcnt, i_xcnt_load
+    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
+    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
+    neg            i_xcnt
+    sub            i_ycnt, 1
+    jnz            .height_loop
+
+    pop            i_xcnt
+%assign push_num push_num - 1
+%undef i_xcnt_load
+
+.done:
+    mov            r6, p_sadframe
+    vextracti128   xmm2, y %+ mm_sadframe, 1
+    vpaddd         xmm2, x %+ mm_sadframe, xmm2
+    vpunpckhqdq    xmm1, xmm2, xmm2
+    vpaddd         xmm2, xmm2, xmm1
+    vmovd          [r6], xmm2
+    vzeroupper
+
+    POPM           saveregs
+    POP_XMM
+    LOAD_5_PARA_POP
+%undef           p_cur
+%undef           p_ref
+%undef           i_xcnt
+%undef           i_ycnt
+%undef           i_stride
+%undef           r_tmp
+%undef           xcnt_unit
+%undef           i_stride3
+%undef           i_stride5
+%undef           i_stride7
+%undef           mm_sadframe
+%undef           mm_zero
+%undef           saveregs
+%undef           p_sadframe
+%undef           p_sad8x8
+    ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSadVar_8Lines 7
+%define mm_tmp0    %2
+%define mm_tmp1    %3
+%define mm_sad     %4
+%define mm_sum     %5
+%define mm_sqsum   %6
+%define b_second_blocks %7
+    ; Unroll for better performance on Haswell.
+    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+    lea            r_tmp, [5 * i_stride]
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur,                 p_ref,                 %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride,  p_ref + 1 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride,  p_ref + 2 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride,  p_ref + 4 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    lea            r_tmp, [i_stride + 2 * i_stride3]
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    add            p_cur, %1 %+ mm_width
+    add            p_ref, %1 %+ mm_width
+%else
+    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+    vpxor          x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+    vpxor          x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+    lea            r_tmp, [8 * i_stride]
+    add            p_cur, r_tmp
+    add            p_ref, r_tmp
+    neg            r_tmp
+%%loop:
+    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+    add            r_tmp, i_stride
+    jl             %%loop
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+%endif
+    AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+    vpaddd         y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+    vpunpcklqdq    %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum
+    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+    vpshufd        %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+    AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadVar_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadVar_avx2
+%define          p_sadframe                    ptrword arg6
+%define          p_sad8x8                      ptrword arg7
+%define          p_sum16x16                    ptrword arg8
+%define          p_sqsum16x16                  ptrword arg9
+%ifdef X86_32
+%define          saveregs                      r5, r6
+%else
+%define          saveregs                      rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    PUSHM          saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+    vmovdqa        y %+ mm_sadframe, y %+ mm_zero
+
+    and            r2, -16                     ; iPicWidth &= -16
+    jle            .done                       ; bail if iPicWidth < 16
+    sar            r3, 4                       ; iPicHeight / 16
+    jle            .done                       ; bail if iPicHeight < 16
+    shr            r2, 2                       ; iPicWidth / 4
+
+%define p_cur     r0
+%define p_ref     r1
+%define i_xcnt    r2
+%define i_ycnt    ptrword arg4
+%define i_stride  r4
+%define r_tmp     r6
+%define xcnt_unit 4
+%ifdef X86_32
+    mov            i_ycnt, r3
+    mov            r3, p_sad8x8
+    %undef  p_sad8x8
+    %define p_sad8x8 r3
+    %define i_stride3 r5
+%else
+    mov            rbp, p_sad8x8
+    mov            r12, p_sum16x16
+    mov            r13, p_sqsum16x16
+    %undef  p_sad8x8
+    %undef  p_sum16x16
+    %undef  p_sqsum16x16
+    %define p_sad8x8 rbp
+    %define p_sum16x16 r12
+    %define p_sqsum16x16 r13
+    %define i_stride3 rbx
+%endif
+    lea            i_stride3, [3 * i_stride]
+
+    ; offset pointers so as to compensate for the i_xcnt offset below.
+    sub            p_sad8x8,      4 * 16 / xcnt_unit
+    sub            p_sum16x16,    1 * 16 / xcnt_unit
+    sub            p_sqsum16x16,  1 * 16 / xcnt_unit
+
+    ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+    neg            i_xcnt
+
+.height_loop:
+    push           i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+    ; use end-of-line pointers so as to enable use of a negative counter as index.
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    sub            p_sad8x8, r_tmp
+    sub            p_sum16x16, i_xcnt
+    sub            p_sqsum16x16, i_xcnt
+    add            i_xcnt, 16 / xcnt_unit
+    jz             .width_loop_upper8_remaining16
+.width_loop_upper8:
+    AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_upper8
+    jg             .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+    AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    mov            i_xcnt, i_xcnt_load
+    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
+    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
+    add            i_xcnt, 16 / xcnt_unit
+    jz             .width_loop_lower8_remaining16
+.width_loop_lower8:
+    AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_lower8
+    jg             .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+    AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+    pop            i_xcnt
+    %assign push_num push_num - 1
+    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
+    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
+    sub            i_ycnt, 1
+    jnz            .height_loop
+
+.done:
+    mov            r_tmp, p_sadframe
+    vextracti128   xmm2, y %+ mm_sadframe, 1
+    vpaddd         xmm2, x %+ mm_sadframe, xmm2
+    vpunpckhqdq    xmm1, xmm2, xmm2
+    vpaddd         xmm2, xmm2, xmm1
+    vmovd          [r_tmp], xmm2
+    vzeroupper
+
+    POPM           saveregs
+    POP_XMM
+    LOAD_5_PARA_POP
+%undef           p_cur
+%undef           p_ref
+%undef           i_xcnt
+%undef           i_ycnt
+%undef           i_stride
+%undef           i_stride3
+%undef           r_tmp
+%undef           xcnt_unit
+%undef           mm_sadframe
+%undef           mm_zero
+%undef           saveregs
+%undef           p_sadframe
+%undef           p_sad8x8
+%undef           p_sum16x16
+%undef           p_sqsum16x16
+    ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadSsd_8Lines 9
+%define mm_tmp0    %2
+%define mm_tmp1    %3
+%define mm_tmp2    %4
+%define mm_sad     %5
+%define mm_sum     %6
+%define mm_sqsum   %7
+%define mm_sqdiff  %8
+%define b_second_blocks %9
+    ; Unroll for better performance on Haswell.
+    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+%ifdef i_stride5
+    lea            r_tmp, [i_stride + 2 * i_stride3]
+    %define i_stride5_ i_stride5
+%else
+    lea            r_tmp, [5 * i_stride]
+    %define i_stride5_ r_tmp
+%endif
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur,                  p_ref,                  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride,   p_ref + 1 * i_stride,   %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride,   p_ref + 2 * i_stride,   %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3,  p_ref + 1 * i_stride3,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride,   p_ref + 4 * i_stride,   %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+%ifndef i_stride5
+    lea            r_tmp, [i_stride + 2 * i_stride3]
+%endif
+%undef i_stride5_
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3,  p_ref + 2 * i_stride3,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp,          p_ref + r_tmp,          %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    add            p_cur, %1 %+ mm_width
+    add            p_ref, %1 %+ mm_width
+%else
+    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+    vpxor          x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+    vpxor          x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+    vpxor          x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+    lea            r_tmp, [8 * i_stride]
+    add            p_cur, r_tmp
+    add            p_ref, r_tmp
+    neg            r_tmp
+%%loop:
+    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    add            r_tmp, i_stride
+    jl             %%loop
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+%endif
+    mov            r_tmp, p_sad8x8
+    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+    vpaddd         y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+    vmovdqa        sadframe_acc, y %+ mm_tmp1
+%else
+    vpaddd         sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+    mov            r_tmp, i_xcnt
+    add            r_tmp, p_sum16x16
+    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+    AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+    vpunpcklqdq    %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+    vpshufd        %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+    AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsd_avx2
+%define          p_sadframe                    ptrword arg6
+%define          p_sad8x8                      ptrword arg7
+%define          p_sum16x16                    ptrword arg8
+%define          p_sqsum16x16                  ptrword arg9
+%define          p_sqdiff16x16                 ptrword arg10
+%ifdef X86_32
+%define          saveregs                      r5, r6
+%else
+%define          saveregs                      rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 9
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    PUSHM          saveregs
+
+%define mm_zero mm0
+    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+    STACK_ALLOC    r5, ymm_width, ymm_width
+    %define sadframe_acc_addr r5
+    %define sadframe_acc [sadframe_acc_addr]
+%else
+    %define sadframe_acc ymm8
+    %define xsadframe_acc xmm8
+%endif
+    vmovdqa        sadframe_acc, y %+ mm_zero
+
+    and            r2, -16                     ; iPicWidth &= -16
+    jle            .done                       ; bail if iPicWidth < 16
+    sar            r3, 4                       ; iPicHeight / 16
+    jle            .done                       ; bail if iPicHeight < 16
+    shr            r2, 2                       ; iPicWidth / 4
+
+%define p_cur     r0
+%define p_ref     r1
+%define i_xcnt    r2
+%define i_ycnt    ptrword arg4
+%define i_stride  r4
+%define r_tmp     r6
+%define xcnt_unit 4
+%ifdef X86_32
+    mov            i_ycnt, r3
+    %define i_stride3 r3
+%else
+    mov            r12, p_sad8x8
+    mov            r13, p_sum16x16
+    mov            r14, p_sqsum16x16
+    mov            r15, p_sqdiff16x16
+    %undef  p_sad8x8
+    %undef  p_sum16x16
+    %undef  p_sqsum16x16
+    %undef  p_sqdiff16x16
+    %define p_sad8x8 r12
+    %define p_sum16x16 r13
+    %define p_sqsum16x16 r14
+    %define p_sqdiff16x16 r15
+    %define i_stride3 rbx
+    %define i_stride5 rbp
+    lea            i_stride5, [5 * i_stride]
+%endif
+    lea            i_stride3, [3 * i_stride]
+
+    ; offset pointers so as to compensate for i_xcnt offset below.
+    sub            p_sad8x8,      4 * 16 / xcnt_unit
+    sub            p_sum16x16,    1 * 16 / xcnt_unit
+    sub            p_sqsum16x16,  1 * 16 / xcnt_unit
+    sub            p_sqdiff16x16, 1 * 16 / xcnt_unit
+
+    ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+    neg            i_xcnt
+
+.height_loop:
+    push           i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+    ; use end-of-line pointers so as to enable use of a negative counter as index.
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    sub            p_sad8x8, r_tmp
+    sub            p_sum16x16, i_xcnt
+    sub            p_sqsum16x16, i_xcnt
+    sub            p_sqdiff16x16, i_xcnt
+    add            i_xcnt, 16 / xcnt_unit
+    jz             .width_loop_upper8_remaining16
+.width_loop_upper8:
+    AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_upper8
+    jg             .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+    AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    mov            i_xcnt, i_xcnt_load
+    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
+    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
+    add            i_xcnt, 16 / xcnt_unit
+    jz             .width_loop_lower8_remaining16
+.width_loop_lower8:
+    AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_lower8
+    jg             .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+    AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+    pop            i_xcnt
+    %assign push_num push_num - 1
+    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
+    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
+    sub            i_ycnt, 1
+    jnz            .height_loop
+
+.done:
+    mov            r_tmp, p_sadframe
+%ifdef X86_32
+    vmovdqa        xmm2, sadframe_acc
+    vpaddd         xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+    vextracti128   xmm2, sadframe_acc, 1
+    vpaddd         xmm2, xsadframe_acc, xmm2
+%endif
+    vpunpckhqdq    xmm1, xmm2, xmm2
+    vpaddd         xmm2, xmm2, xmm1
+    vmovd          [r_tmp], xmm2
+    vzeroupper
+%ifdef X86_32
+    STACK_DEALLOC
+%endif
+    POPM           saveregs
+    POP_XMM
+    LOAD_5_PARA_POP
+%undef           p_cur
+%undef           p_ref
+%undef           i_xcnt
+%undef           i_ycnt
+%undef           i_stride
+%undef           i_stride3
+%undef           i_stride5
+%undef           r_tmp
+%undef           xcnt_unit
+%undef           sadframe_acc
+%undef           sadframe_acc_addr
+%undef           xsadframe_acc
+%undef           mm_zero
+%undef           saveregs
+%undef           p_sadframe
+%undef           p_sad8x8
+%undef           p_sum16x16
+%undef           p_sqsum16x16
+%undef           p_sqdiff16x16
+    ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadBgd_8Lines 9
+%define mm_tmp0    %2
+%define mm_tmp1    %3
+%define mm_tmp2    %8
+%define mm_mad     %4
+%define mm_sumcur  %5
+%define mm_sumref  %6
+%define mm_sad     %7
+%define b_second_blocks %9
+    ; Unroll for better performance on Haswell.
+    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+    lea            r_tmp, [5 * i_stride]
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur,                 p_ref,                 %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride,  p_ref + 1 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride,  p_ref + 2 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride,  p_ref + 4 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    lea            r_tmp, [i_stride + 2 * i_stride3]
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    add            p_cur, %1 %+ mm_width
+    add            p_ref, %1 %+ mm_width
+%else
+    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+    vpxor          x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur
+    vpxor          x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+    vpxor          x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+    lea            r_tmp, [8 * i_stride]
+    add            p_cur, r_tmp
+    add            p_ref, r_tmp
+    neg            r_tmp
+%%loop:
+    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+    add            r_tmp, i_stride
+    jl             %%loop
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+%endif
+    mov            r_tmp, p_sad8x8
+    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+    vpaddd         y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+    vmovdqa        sadframe_acc, y %+ mm_tmp1
+%else
+    vpaddd         sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+    mov            r_tmp, p_sd8x8
+    vpsubd         %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref
+    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks
+    ; Coalesce store and horizontal reduction of MAD accumulator for even and
+    ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+    test           i_xcnt, 32 / xcnt_unit
+    jz             %%preserve_mad
+    mov            r_tmp, p_mad8x8
+    AVX2_Maxubq2   y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+    AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+    vmovdqa        prev_mad, y %+ mm_mad
+%else
+    mov            r_tmp, p_mad8x8
+    AVX2_Maxubq    %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+    AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_mad
+%undef mm_sumcur
+%undef mm_sumref
+%undef mm_sad
+%undef b_second_blocks
+%endmacro
+
+; Store remaining MAD accumulator for width & 32 cases.
+; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4
+%macro AVX2_StoreRemainingSingleMad 4
+    test           %1, 32 / xcnt_unit
+    jz             %%skip
+    mov            r_tmp, p_mad8x8
+    vmovdqa        y%2, prev_mad
+    AVX2_Maxubq    y%2, y%2, y%3
+    AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4
+%%skip:
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                        int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadBgd_avx2
+%define          p_sadframe                    arg6
+%define          p_sad8x8                      arg7
+%define          p_sd8x8                       arg8
+%define          p_mad8x8                      arg9
+%ifdef X86_32
+%define          saveregs                      r5, r6
+%else
+%define          saveregs                      rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 10
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    PUSHM          saveregs
+
+%define mm_zero mm0
+    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+    STACK_ALLOC    r5, 2 * ymm_width, ymm_width
+    %define sadframe_acc_addr r5
+    %define sadframe_acc [sadframe_acc_addr]
+    %define prev_mad [r5 + ymm_width]
+%else
+    %define sadframe_acc ymm8
+    %define xsadframe_acc xmm8
+    %define prev_mad ymm9
+%endif
+    vmovdqa        sadframe_acc, y %+ mm_zero
+
+    and            r2, -16                     ; iPicWidth &= -16
+    jle            .done                       ; bail if iPicWidth < 16
+    sar            r3, 4                       ; iPicHeight / 16
+    jle            .done                       ; bail if iPicHeight < 16
+    shr            r2, 2                       ; iPicWidth / 4
+
+%define p_cur     r0
+%define p_ref     r1
+%define i_xcnt    r2
+%define i_ycnt    ptrword arg4
+%define i_stride  r4
+%define r_tmp     r6
+%define xcnt_unit 4
+%ifdef X86_32
+    mov            i_ycnt, r3
+    %define i_stride3 r3
+%else
+    mov            rbp, p_sad8x8
+    mov            r12, p_sd8x8
+    mov            r13, p_mad8x8
+    %undef  p_sad8x8
+    %undef  p_sd8x8
+    %undef  p_mad8x8
+    %define p_sad8x8 rbp
+    %define p_sd8x8 r12
+    %define p_mad8x8 r13
+    %define i_stride3 rbx
+%endif
+    lea            i_stride3, [3 * i_stride]
+
+    ; offset pointers to compensate for the i_xcnt offset below.
+    mov            r_tmp, i_xcnt
+    and            r_tmp, 64 / xcnt_unit - 1
+    sub            p_mad8x8, r_tmp
+    shl            r_tmp, 2
+    sub            p_sad8x8, r_tmp
+    sub            p_sd8x8, r_tmp
+
+.height_loop:
+    push           i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+    ; use end-of-line pointers so as to enable use of a negative counter as index.
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    add            p_sad8x8, r_tmp
+    add            p_sd8x8, r_tmp
+    add            p_mad8x8, i_xcnt
+    and            i_xcnt, -(64 / xcnt_unit)
+    jz             .width_loop_upper8_64x_end
+    ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+    neg            i_xcnt
+.width_loop_upper8:
+    AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_upper8
+    jg             .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+    test           i_xcnt_load, 32 / xcnt_unit
+    jnz            .width_loop_upper8
+.width_loop_upper8_32x_end:
+    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+    test           i_xcnt_load, 16 / xcnt_unit
+    jz             .width_loop_upper8_end
+    ; remaining 16.
+    AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    mov            i_xcnt, i_xcnt_load
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+    and            i_xcnt, -(64 / xcnt_unit)
+    jz             .width_loop_lower8_64x_end
+    neg            i_xcnt
+.width_loop_lower8:
+    AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_lower8
+    jg             .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+    test           i_xcnt_load, 32 / xcnt_unit
+    jnz            .width_loop_lower8
+.width_loop_lower8_32x_end:
+    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+    test           i_xcnt_load, 16 / xcnt_unit
+    jz             .width_loop_lower8_end
+    ; remaining 16.
+    AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    pop            i_xcnt
+%undef i_xcnt_load
+    %assign push_num push_num - 1
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+    sub            i_ycnt, 1
+    jnz            .height_loop
+
+.done:
+    mov            r_tmp, p_sadframe
+%ifdef X86_32
+    vmovdqa        xmm2, sadframe_acc
+    vpaddd         xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+    vextracti128   xmm2, sadframe_acc, 1
+    vpaddd         xmm2, xsadframe_acc, xmm2
+%endif
+    vpunpckhqdq    xmm1, xmm2, xmm2
+    vpaddd         xmm2, xmm2, xmm1
+    vmovd          [r_tmp], xmm2
+    vzeroupper
+%ifdef X86_32
+    STACK_DEALLOC
+%endif
+    POPM           saveregs
+    POP_XMM
+    LOAD_5_PARA_POP
+%undef           p_cur
+%undef           p_ref
+%undef           i_xcnt
+%undef           i_ycnt
+%undef           i_stride
+%undef           i_stride3
+%undef           r_tmp
+%undef           xcnt_unit
+%undef           sadframe_acc
+%undef           sadframe_acc_addr
+%undef           xsadframe_acc
+%undef           prev_mad
+%undef           mm_zero
+%undef           saveregs
+%undef           p_sadframe
+%undef           p_sad8x8
+%undef           p_sd8x8
+%undef           p_mad8x8
+    ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11
+%macro AVX2_CalcSadSsdBgd_8Lines 11
+%define mm_tmp0    %2
+%define mm_tmp1    %3
+%define mm_sad     %4
+%define mm_sum     %5
+%define mm_sumref  %6
+%define mm_mad     %7
+%define mm_sqsum   %8
+%define mm_sqdiff  %9
+%ifidn %10, 0
+%define tmp2       0
+%else
+%define tmp2       %1 %+ %10
+%endif
+%define b_second_blocks %11
+    ; Unroll for better performance on Haswell.
+    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+    lea            r_tmp, [5 * i_stride]
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur,                 p_ref,                 %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride,  p_ref + 1 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride,  p_ref + 2 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride,  p_ref + 4 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    lea            r_tmp, [i_stride + 2 * i_stride3]
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    add            p_cur, %1 %+ mm_width
+    add            p_ref, %1 %+ mm_width
+%else
+    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+    vpxor          x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+    vpxor          x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+    vpxor          x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+    vpxor          x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+    vpxor          x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+    lea            r_tmp, [8 * i_stride]
+    add            p_cur, r_tmp
+    add            p_ref, r_tmp
+    neg            r_tmp
+%%loop:
+    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+    add            r_tmp, i_stride
+    jl             %%loop
+    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+%endif
+    mov            r_tmp, p_sad8x8
+    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+    vpaddd         y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+    vmovdqa        sadframe_acc, y %+ mm_tmp1
+%else
+    vpaddd         sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+    mov            r_tmp, i_xcnt
+    add            r_tmp, p_sum16x16
+    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+    AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+    mov            r_tmp, p_sd8x8
+    vpsubd         %1 %+ mm_sum,  %1 %+ mm_sum, %1 %+ mm_sumref
+    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks
+    ; Coalesce store and horizontal reduction of MAD accumulator for even and
+    ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+    test           i_xcnt, 32 / xcnt_unit
+    jz             %%preserve_mad
+    mov            r_tmp, p_mad8x8
+    AVX2_Maxubq2   y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+    AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+    vmovdqa        prev_mad, y %+ mm_mad
+%else
+    mov            r_tmp, p_mad8x8
+    AVX2_Maxubq    %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+    AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+    vpunpcklqdq    %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0,  %1 %+ mm_tmp1
+    vpshufd        %1 %+ mm_tmp1, %1 %+ mm_tmp0,  10110001b
+    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0,  %1 %+ mm_tmp1
+    AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef mm_mad
+%undef mm_sum
+%undef mm_sumref
+%undef mm_sad
+%undef tmp2
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsdBgd_avx2
+%define         p_sadframe                      arg6
+%define         p_sad8x8                        arg7
+%define         p_sum16x16                      arg8
+%define         p_sqsum16x16                    arg9
+%define         p_sqdiff16x16                   arg10
+%define         p_sd8x8                         arg11
+%define         p_mad8x8                        arg12
+%ifdef X86_32
+%define         saveregs                        r5, r6
+%else
+%define         saveregs                        rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 12
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    PUSHM          saveregs
+
+%ifdef X86_32
+    STACK_ALLOC    r5, 3 * ymm_width, ymm_width
+    %define mm8 0
+    %define sadframe_acc_addr r5
+    %define sadframe_acc [sadframe_acc_addr]
+    %define prev_mad [r5 + ymm_width]
+    %define ymm_zero [r5 + 2 * ymm_width]
+    %define xmm_zero ymm_zero
+    vpxor          xmm0, xmm0, xmm0
+    vmovdqa        sadframe_acc, ymm0
+    vmovdqa        ymm_zero, ymm0
+%else
+    %define sadframe_acc ymm9
+    %define xsadframe_acc xmm9
+    %define prev_mad ymm10
+    %define ymm_zero ymm11
+    %define xmm_zero xmm11
+    vpxor          xmm_zero, xmm_zero, xmm_zero
+    vpxor          xsadframe_acc, xsadframe_acc, xsadframe_acc
+%endif
+
+    and            r2, -16                     ; iPicWidth &= -16
+    jle            .done                       ; bail if iPicWidth < 16
+    sar            r3, 4                       ; iPicHeight / 16
+    jle            .done                       ; bail if iPicHeight < 16
+    shr            r2, 2                       ; iPicWidth / 4
+
+%define p_cur     r0
+%define p_ref     r1
+%define i_xcnt    r2
+%define i_ycnt    ptrword arg4
+%define i_stride  r4
+%define r_tmp     r6
+%define xcnt_unit 4
+%ifdef X86_32
+    mov            i_ycnt, r3
+    %define i_stride3 r3
+%else
+    mov            rbp, p_sad8x8
+    mov            r12, p_sum16x16
+    mov            r13, p_sqsum16x16
+    mov            r14, p_sqdiff16x16
+    mov            r15, p_sd8x8
+    %undef p_sad8x8
+    %undef p_sum16x16
+    %undef p_sqsum16x16
+    %undef p_sqdiff16x16
+    %undef p_sd8x8
+    %define p_sad8x8 rbp
+    %define p_sum16x16 r12
+    %define p_sqsum16x16 r13
+    %define p_sqdiff16x16 r14
+    %define p_sd8x8 r15
+    %define i_stride3 rbx
+%endif
+    lea            i_stride3, [3 * i_stride]
+
+    ; offset pointers so as to compensate for the i_xcnt offset below.
+    mov            r_tmp, i_xcnt
+    and            r_tmp, 64 / xcnt_unit - 1
+    sub            p_sum16x16, r_tmp
+    sub            p_sqsum16x16, r_tmp
+    sub            p_sqdiff16x16, r_tmp
+    sub            p_mad8x8, r_tmp
+    shl            r_tmp, 2
+    sub            p_sad8x8, r_tmp
+    sub            p_sd8x8, r_tmp
+
+.height_loop:
+    push           i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+    ; use end-of-line pointers so as to enable use of a negative counter as index.
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    add            p_sad8x8, r_tmp
+    add            p_sum16x16, i_xcnt
+    add            p_sqsum16x16, i_xcnt
+    add            p_sqdiff16x16, i_xcnt
+    add            p_sd8x8, r_tmp
+    add            p_mad8x8, i_xcnt
+    and            i_xcnt, -(64 / xcnt_unit)
+    jz             .width_loop_upper8_64x_end
+    ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+    neg            i_xcnt
+.width_loop_upper8:
+    AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_upper8
+    jg             .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+    test           i_xcnt_load, 32 / xcnt_unit
+    jnz            .width_loop_upper8
+.width_loop_upper8_32x_end:
+    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+    test           i_xcnt_load, 16 / xcnt_unit
+    jz             .width_loop_upper8_end
+    ; remaining 16.
+    AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+.width_loop_upper8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    mov            i_xcnt, i_xcnt_load
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+    and            i_xcnt, -(64 / xcnt_unit)
+    jz             .width_loop_lower8_64x_end
+    neg            i_xcnt
+.width_loop_lower8:
+    AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+    add            i_xcnt, 32 / xcnt_unit
+    jl             .width_loop_lower8
+    jg             .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+    test           i_xcnt_load, 32 / xcnt_unit
+    jnz            .width_loop_lower8
+.width_loop_lower8_32x_end:
+    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+    test           i_xcnt_load, 16 / xcnt_unit
+    jz             .width_loop_lower8_end
+    ; remaining 16.
+    AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+.width_loop_lower8_end:
+    lea            p_cur, [p_cur + 8 * i_stride]
+    lea            p_ref, [p_ref + 8 * i_stride]
+    pop            i_xcnt
+%undef i_xcnt_load
+    %assign push_num push_num - 1
+    lea            r_tmp, [xcnt_unit * i_xcnt]
+    sub            p_cur, r_tmp
+    sub            p_ref, r_tmp
+    sub            i_ycnt, 1
+    jnz            .height_loop
+
+.done:
+    mov            r_tmp, p_sadframe
+%ifdef X86_32
+    vmovdqa        xmm2, sadframe_acc
+    vpaddd         xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+    vextracti128   xmm2, sadframe_acc, 1
+    vpaddd         xmm2, xsadframe_acc, xmm2
+%endif
+    vpunpckhqdq    xmm1, xmm2, xmm2
+    vpaddd         xmm2, xmm2, xmm1
+    vmovd          [r_tmp], xmm2
+    vzeroupper
+%ifdef X86_32
+    STACK_DEALLOC
+%endif
+    POPM           saveregs
+    POP_XMM
+    LOAD_5_PARA_POP
+%undef           p_cur
+%undef           p_ref
+%undef           i_xcnt
+%undef           i_ycnt
+%undef           i_stride
+%undef           i_stride3
+%undef           r_tmp
+%undef           xcnt_unit
+%undef           mm8
+%undef           sadframe_acc
+%undef           sadframe_acc_addr
+%undef           xsadframe_acc
+%undef           prev_mad
+%undef           ymm_zero
+%undef           xmm_zero
+%undef           saveregs
+%undef           p_sadframe
+%undef           p_sad8x8
+%undef           p_sum16x16
+%undef           p_sqsum16x16
+%undef           p_sqdiff16x16
+%undef           p_sd8x8
+%undef           p_mad8x8
+    ret
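
Two of the performance idioms called out in the comments above (the multiple accumulators in AVX2_CalcSad_8Lines and the negative loop counters in the width loops) are easier to see in scalar form. A minimal C++ sketch under those assumptions; names are illustrative, not from the patch.

#include <cstddef>
#include <cstdint>

// 1) Negative-counter indexing: keep end-of-row pointers and let one register
//    serve as both loop counter (counting up toward zero) and data index.
static int32_t RowSad (const uint8_t* pCurEnd, const uint8_t* pRefEnd, ptrdiff_t iNegWidth) {
  int32_t iSad = 0;
  for (ptrdiff_t i = iNegWidth; i != 0; ++i) {  // i runs from -width up to 0
    const int32_t iDiff = (int32_t) pCurEnd[i] - (int32_t) pRefEnd[i];
    iSad += iDiff >= 0 ? iDiff : -iDiff;
  }
  return iSad;
}
// Usage: RowSad (pCur + iWidth, pRef + iWidth, -(ptrdiff_t) iWidth);

// 2) Multiple accumulators (cf. mm_sad/mm_sad2/mm_sad3/mm_sad4): independent
//    partial sums shorten the dependency chain so more additions can overlap.
static int64_t SumWithFourAccumulators (const int32_t* pData, size_t kNum) {
  int64_t iAcc0 = 0, iAcc1 = 0, iAcc2 = 0, iAcc3 = 0;
  size_t i = 0;
  for (; i + 4 <= kNum; i += 4) {
    iAcc0 += pData[i + 0];
    iAcc1 += pData[i + 1];
    iAcc2 += pData[i + 2];
    iAcc3 += pData[i + 3];
  }
  for (; i < kNum; ++i)
    iAcc0 += pData[i];
  return (iAcc0 + iAcc1) + (iAcc2 + iAcc3);
}
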
--- a/test/processing/ProcessUT_VaaCalc.cpp
+++ b/test/processing/ProcessUT_VaaCalc.cpp
@@ -828,6 +828,12 @@
 GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_sse2, 1, WELS_CPU_SSE2)
 GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_sse2, 1, WELS_CPU_SSE2)
 GENERATE_VAACalcSadVar_UT (VAACalcSadVar_sse2, 1, WELS_CPU_SSE2)
+
+GENERATE_VAACalcSad_UT (VAACalcSad_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadVar_UT (VAACalcSadVar_avx2, 1, WELS_CPU_AVX2)
 #endif
 
 #if defined(HAVE_NEON)