ref: f76daa92ad831c5ca179d2ec16ac9d7996874b51
parent: 36ee29037b3ebc36e848561392c05bbf2cb4cab8
author: Sindre Aamås <saamas@cisco.com>
date: Tue Mar 7 09:24:50 EST 2017
[Encoder/x86] Simplify and extend score.asm X86_32_PICASM handling. Utilize program-counter-relative offsets to simplify the X86_32_PICASM code. In order for this to work with nasm, data constants are placed in the text segment. Extend support for X86_32_PICASM to all routines and enable the previously disabled routines.
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -75,9 +75,7 @@
#ifdef X86_ASM
-#ifndef X86_32_PICASM
int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
-#endif
int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
/****************************************************************************
@@ -86,9 +84,7 @@
void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
-#ifndef X86_32_PICASM
int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
-#endif
/****************************************************************************
* DCT functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -500,9 +500,7 @@
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx;
}
if (uiCpuFlag & WELS_CPU_SSE2) {
-#ifndef X86_32_PICASM
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse2;
-#endif
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
pFuncList->pfQuantization4x4 = WelsQuant4x4_sse2;
@@ -516,9 +514,7 @@
pFuncList->pfScan4x4 = WelsScan4x4DcAc_sse2;
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2;
-#ifndef X86_32_PICASM
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;
-#endif
pFuncList->pfDctT4 = WelsDctT4_sse2;
pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -49,7 +49,11 @@
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
@@ -200,6 +204,7 @@
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
@@ -207,29 +212,12 @@
pextrw r1d, xmm1, 0 ; eax = [8]
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x0d0c0706 ;pb_scanacdc_maska
- push 0x05040b0a
- push 0x0f0e0908
- push 0x03020100
- push 0x0f0e0d0c ;pb_scanacdc_maskb
- push 0x07060100
- push 0x05040b0a
- push 0x09080302
- pshufb xmm1, [esp]
- pshufb xmm0, [esp+16]
- mov esp, r0
- pop r0
-%else
- pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
-%endif
+ pshufb xmm1, [pic(pb_scanacdc_maskb)]
+ pshufb xmm0, [pic(pb_scanacdc_maska)]
movdqa [r0],xmm0
movdqa [r0+16], xmm1
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
@@ -268,7 +256,6 @@
ret
-%ifndef X86_32_PICASM
;***********************************************************************
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
@@ -279,6 +266,7 @@
%else
%assign push_num 0
%endif
+ INIT_X86_32_PIC r4
LOAD_1_PARA
movdqa xmm0, [r0]
movdqa xmm1, [r0+16]
@@ -309,16 +297,17 @@
.find1end:
sub r1, r2
sub r1, 1
- lea r2, [i_ds_table]
+ lea r2, [pic(i_ds_table)]
add r0b, [r2+r1]
mov r1, r3
and r3, 0xff
shr r1, 8
and r1, 0xff
- lea r2 , [low_mask_table]
+ lea r2 , [pic(low_mask_table)]
add r0b, [r2 +r3]
- lea r2, [high_mask_table]
+ lea r2, [pic(high_mask_table)]
add r0b, [r2+r1]
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r3
%else
@@ -325,15 +314,14 @@
mov retrd, r0d
%endif
ret
-%endif ;ifndef X86_32_PICASM
-%ifndef X86_32_PICASM
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_1_PARA
movdqa xmm0, [r0]
movdqa xmm1, [r0+16]
@@ -350,14 +338,14 @@
; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
; xor retr, retr
;add al, [nozero_count_table+r2]
- lea r0 , [nozero_count_table]
+ lea r0 , [pic(nozero_count_table)]
movzx r2, byte [r0+r2]
movzx r1, byte [r0+r1]
mov retrq, r2
add retrq, r1
;add al, [nozero_count_table+r1]
+ DEINIT_X86_32_PIC
ret
-%endif ;%ifndef X86_32_PICASM
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -222,7 +222,6 @@
}
#endif //HAVE_AVX2
-#ifndef X86_32_PICASM
TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, iDctC, 16);
@@ -236,7 +235,6 @@
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
}
-#endif //#ifndef X86_32_PICASM
#endif
void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
@@ -304,11 +302,9 @@
TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
}
#ifdef X86_ASM
-#ifndef X86_32_PICASM
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
}
-#endif
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);