shithub: openh264

Download patch

ref: ac08cc4b2fcd087c9d56bd89a8092d582a9433fa
parent: 9a89ee72b0bfb0f0202d40466c15a596caace543
parent: 01b74ea7c1b8deba1fe2f8ecbc3a4d4e9afeab48
author: huili2 <huili2@cisco.com>
date: Tue Jan 6 08:37:56 EST 2015

Merge pull request #1694 from zhilwang/asm-SetNoneZero

Add asm code for NoneZeroCount and refine related code

--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -834,17 +834,12 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
-
-    vld1.64 {d0-d2}, [r0]
-
-    vceq.s8 q0, q0, #0
-    vceq.s8 d2, d2, #0
-    vmvn    q0, q0
-    vmvn    d2, d2
-    vabs.s8 q0, q0
-    vabs.s8 d2, d2
-
-    vst1.64 {d0-d2}, [r0]
+    mov       r1, #1                    // r0 = int8_t* pNonZeroCount; its 24 bytes are rewritten in place
+    vdup.8    q2, r1                    // q2 (d4/d5) = 0x01 in every byte lane
+    vld1.64   {d0,d1,d2}, [r0]          // load the 24 count bytes into d0..d2
+    vmin.u8   q0, q0, q2                // unsigned min with 1: 0 stays 0, every other byte becomes 1
+    vmin.u8   d2, d2, d4                // unsigned (not .s8) so bytes >= 0x80 also map to 1, like !!x in WelsNonZeroCount_c
+    vst1.64   {d0,d1,d2}, [r0]          // write the 0/1 flags back over the input
 WELS_ASM_FUNC_END
 
 #ifdef __APPLE__
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -553,16 +553,12 @@
 #endif
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
+    // x0 = int8_t* pNonZeroCount: 24 bytes, each clamped in place to 0 (was zero) or 1 (was non-zero)
+    movi v3.8b, #1                      // v3 = 0x01 in each of the 8 byte lanes, no general-purpose register needed
     ld1 {v0.8b, v1.8b, v2.8b}, [x0]
-    ins v0.d[1], v1.d[0]
-    uzp1 v0.2d, v0.2d, v1.2d
-    cmeq v0.16b, v0.16b, #0
-    cmeq v2.8b, v2.8b, #0
-    mvn v0.16b, v0.16b
-    mvn v2.8b, v2.8b
-    abs v0.16b, v0.16b
-    abs v2.8b, v2.8b
-    ins v1.d[0], v0.d[1]
+    umin  v0.8b, v0.8b, v3.8b           // bytes  0..7 : unsigned min with 1
+    umin  v1.8b, v1.8b, v3.8b           // bytes  8..15: unsigned min with 1
+    umin  v2.8b, v2.8b, v3.8b           // bytes 16..23: unsigned min with 1
     st1 {v0.8b, v1.8b, v2.8b}, [x0]
 WELS_ASM_AARCH64_FUNC_END
 
--- a/codec/common/inc/deblocking_common.h
+++ b/codec/common/inc/deblocking_common.h
@@ -15,6 +15,8 @@
                           int8_t* pTc);
 void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
+void WelsNonZeroCount_c (int8_t* pNonZeroCount);
+
 #if defined(__cplusplus)
 extern "C" {
 #endif//__cplusplus
@@ -32,6 +34,7 @@
 void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                               int8_t* pTC);
+void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount);
 #endif
 
 #if defined(HAVE_NEON)
@@ -48,6 +51,7 @@
 void DeblockChromaLt4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                              int8_t* pTC);
 void DeblockChromaEq4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
 #endif
 
 #if defined(HAVE_NEON_AARCH64)
@@ -61,6 +65,7 @@
 void DeblockChromaLt4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                                      int8_t* pTC);
 void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
 #endif
 #if defined(__cplusplus)
 }
--- a/codec/common/src/deblocking_common.cpp
+++ b/codec/common/src/deblocking_common.cpp
@@ -180,6 +180,13 @@
   DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
 }
 
+void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
+  // Collapse each of the 24 count bytes to a flag in place: 0 stays 0, any other value becomes 1.
+  for (int32_t iIdx = 0; iIdx < 24; ++iIdx) {
+    pNonZeroCount[iIdx] = (pNonZeroCount[iIdx] != 0) ? 1 : 0;
+  }
+}
+
 #ifdef X86_ASM
 extern "C" {
   void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -5276,3 +5276,14 @@
     pop      r3
     ret
 
+WELS_EXTERN WelsNonZeroCount_sse2       ; void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount): clamps 24 bytes to 0/1 in place
+    %assign  push_num 0                 ; nothing pushed before the parameter load
+    LOAD_1_PARA                         ; project macro; presumably leaves the first argument (the buffer) in r0 - confirm
+    movq    xmm1, [r0+16]               ; bytes 16..23 (low 8 bytes of xmm1)
+    movdqu  xmm0, [r0]                  ; bytes 0..15, unaligned load
+    WELS_DB1 xmm2                       ; project macro; presumably sets every byte of xmm2 to 1 - confirm
+    pminub  xmm1, xmm2                  ; unsigned byte min against xmm2 for the tail
+    pminub  xmm0, xmm2                  ; unsigned byte min against xmm2 for the first 16 bytes
+    movq    [r0+16], xmm1               ; store bytes 16..23 back
+    movdqu  [r0], xmm0                  ; store bytes 0..15 back
+    ret
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -99,18 +99,6 @@
 //  }
 .endm
 #endif
-// r0    int8_t* non_zero_count,
-WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
-    vld1.64 {d0-d2}, [r0]
-    vceq.s8 q0, q0, #0
-    vceq.s8 d2, d2, #0
-    vmvn    q0, q0
-    vmvn    d2, d2
-    vabs.s8 q0, q0
-    vabs.s8 d2, d2
-    vst1.64 {d0-d2}, [r0]
-WELS_ASM_FUNC_END
-
 
 //  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -100,20 +100,6 @@
 //  }
 .endm
 #endif
-// x0    int8_t* non_zero_count,
-WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon
-    mov x1, x0
-    ld1 {v0.16b}, [x1], #16
-    ld1 {v1.8b}, [x1]
-    cmeq v0.16b, v0.16b, #0
-    cmeq v1.8b, v1.8b, #0
-    mvn  v0.16b, v0.16b
-    mvn  v1.8b, v1.8b
-    abs  v0.16b, v0.16b
-    abs  v1.8b, v1.8b
-    st1 {v0.16b}, [x0], #16
-    st1 {v1.8b}, [x0]
-WELS_ASM_AARCH64_FUNC_END
 
 //  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
--- a/codec/decoder/core/inc/decode_slice.h
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -73,13 +73,11 @@
 #endif
 
 #if defined(HAVE_NEON)
-void SetNonZeroCount_neon (int8_t* pNonZeroCount);
 void WelsBlockZero16x16_neon(int16_t * block, int32_t stride);
 void WelsBlockZero8x8_neon(int16_t * block, int32_t stride);
 #endif
 
 #if defined(HAVE_NEON_AARCH64)
-void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
 void WelsBlockZero16x16_AArch64_neon(int16_t * block, int32_t stride);
 void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride);
 #endif
@@ -86,8 +84,6 @@
 #ifdef __cplusplus
 }
 #endif//__cplusplus
-
-void SetNonZeroCount_c (int8_t* pNonZeroCount);
 
 void WelsBlockFuncInit (SBlockFunc* pFunc,  int32_t iCpu);
 void WelsBlockZero16x16_c(int16_t * block, int32_t stride);
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1695,25 +1695,13 @@
 }
 
 void WelsBlockFuncInit (SBlockFunc*   pFunc,  int32_t iCpu) {
-  pFunc->pWelsSetNonZeroCountFunc	    = SetNonZeroCount_c;
+  pFunc->pWelsSetNonZeroCountFunc	    = WelsNonZeroCount_c;
+  pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_c;
+  pFunc->pWelsBlockZero8x8Func          = WelsBlockZero8x8_c;
 
 #ifdef	HAVE_NEON
   if (iCpu & WELS_CPU_NEON) {
-    pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_neon;
-  }
-#endif
-
-#ifdef	HAVE_NEON_AARCH64
-  if (iCpu & WELS_CPU_NEON) {
-    pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_AArch64_neon;
-  }
-#endif
-
-  pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_c;
-  pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_c;
-  //TO DO add neon and X86
-#ifdef	HAVE_NEON
-  if (iCpu & WELS_CPU_NEON) {
+    pFunc->pWelsSetNonZeroCountFunc		= WelsNonZeroCount_neon;
     pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_neon;
     pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_neon;
   }
@@ -1721,6 +1709,7 @@
 
 #ifdef	HAVE_NEON_AARCH64
   if (iCpu & WELS_CPU_NEON) {
+    pFunc->pWelsSetNonZeroCountFunc		= WelsNonZeroCount_AArch64_neon;
     pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_AArch64_neon;
     pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_AArch64_neon;
   }
@@ -1728,19 +1717,12 @@
 
 #if defined(X86_ASM)
   if (iCpu & WELS_CPU_SSE2) {
+    pFunc->pWelsSetNonZeroCountFunc		= WelsNonZeroCount_sse2;
     pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_sse2;
     pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_sse2;
   }
 #endif
 
-}
-
-void SetNonZeroCount_c (int8_t* pNonZeroCount) {
-  int32_t i;
-
-  for (i = 0; i < 24; i++) {
-    pNonZeroCount[i] = !!pNonZeroCount[i];
-  }
 }
 
 void WelsBlockInit (int16_t* pBlock, int iW, int iH, int iStride, uint8_t uiVal) {
--- a/codec/encoder/core/inc/deblocking.h
+++ b/codec/encoder/core/inc/deblocking.h
@@ -65,12 +65,10 @@
 extern "C" {
 #endif//__cplusplus
 #if defined(HAVE_NEON)
-void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
 void DeblockingBSCalcEnc_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
                                uint8_t (*pBS)[4][4]);
 #endif
 #if defined(HAVE_NEON_AARCH64)
-void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
 void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
                                        uint8_t (*pBS)[4][4]);
 #endif
@@ -79,7 +77,6 @@
 #endif//__cplusplus
 void DeblockingInit (DeblockingFunc*   pFunc,  int32_t iCpu);
 
-void WelsNonZeroCount_c (int8_t* pNonZeroCount);
 void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero,  int32_t iCpu);
 
 void PerformDeblockingFilter (sWelsEncCtx* pEnc);
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -774,13 +774,6 @@
   }
 }
 
-void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
-  int32_t i;
-
-  for (i = 0; i < 24; i++) {
-    pNonZeroCount[i] = !!pNonZeroCount[i];
-  }
-}
 void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero,  int32_t iCpu) {
   *pfSetNZCZero = WelsNonZeroCount_c;
 #ifdef	HAVE_NEON
@@ -791,6 +784,11 @@
 #ifdef	HAVE_NEON_AARCH64
   if (iCpu & WELS_CPU_NEON) {
     *pfSetNZCZero = WelsNonZeroCount_AArch64_neon;
+  }
+#endif
+#if defined(X86_ASM)
+  if (iCpu & WELS_CPU_SSE2) {
+    *pfSetNZCZero = WelsNonZeroCount_sse2;
   }
 #endif
 }
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -1,7 +1,7 @@
 #include <gtest/gtest.h>
 #include "macros.h"
 #include "decode_mb_aux.h"
-#include "../../codec/decoder/core/src/decode_slice.cpp"
+#include "deblocking.h"
 using namespace WelsDec;
 void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
   int16_t iSrc[16];
@@ -98,7 +98,7 @@
 {\
     int8_t iNonZeroCount[2][24];\
     for(int32_t i = 0; i < 24; i++) {\
-        iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\
+        iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 25);\
     }\
     method(iNonZeroCount[0]);\
     SetNonZeroCount_ref(iNonZeroCount[1]);\
@@ -106,7 +106,7 @@
         ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
     }\
     for(int32_t i =0; i<24; i++) {\
-        iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\
+        iNonZeroCount[0][i] = iNonZeroCount[1][i] = 0;\
     }\
     method(iNonZeroCount[0]);\
     SetNonZeroCount_ref(iNonZeroCount[1]);\
@@ -114,7 +114,7 @@
         ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
     }\
     for(int32_t i =0; i<24; i++) {\
-        iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\
+        iNonZeroCount[0][i] = iNonZeroCount[1][i] = 16;\
     }\
     method(iNonZeroCount[0]);\
     SetNonZeroCount_ref(iNonZeroCount[1]);\
@@ -123,12 +123,16 @@
     }\
 }
 
-GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_c)
 
+#if defined(X86_ASM)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_sse2)
+#endif
+
 #if defined(HAVE_NEON)
-GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_neon)
 #endif
 
 #if defined(HAVE_NEON_AARCH64)
-GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_AArch64_neon)
 #endif