shithub: openh264

Download patch

ref: 7486de28442fae3cb6c7a5fe87a30c9e2a80cac5
parent: 1030820ec472b944126f022cd9170b3b17515969
author: Sindre Aamås <saamas@cisco.com>
date: Wed Jan 27 08:04:44 EST 2016

[Encoder] AVX2 DCT tweaks

Do some shuffling in load/store unpack/pack to save some
work in horizontal DCTs.

Use a few 128-bit broadcasts to compact data vectors a bit.

~1.04x speedup for the DCT case on Haswell.
~1.12x speedup for the IDCT case on Haswell.

--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -49,14 +49,18 @@
 ;***********************************************************************
 
 align 32
+wels_shufb0312_movzxw_128:  ; vpshufb mask: bytes 0,3,1,2 / 4,7,5,6 -> words; index 80h yields a zero high byte
+    db 0, 80h, 3, 80h, 1, 80h, 2, 80h, 4, 80h, 7, 80h, 5, 80h, 6, 80h
+wels_shufb2301_128:         ; vpshufb mask: swap adjacent 16-bit words
+    db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+wels_shufb0231_128:         ; vpshufb mask: byte order 0,2,3,1 per group of 4 (inverse of 0,3,1,2)
+    db 0, 2, 3, 1, 4, 6, 7, 5, 8, 10, 11, 9, 12, 14, 15, 13
+wels_dw32_128:              ; 8 words of 32; passed as dw32 to AVX2_StoreDiff32P (added before its >>6)
+    times 8 dw 32
 wels_p1m1p1m1w_256:
     times 8 dw 1, -1
-wels_p1p2p1p2w_256:
-    times 8 dw 1, 2
-wels_rev64w_256:
-    times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
-wels_p1m1m1p1w_256:
-    times 4 dw 1, -1, -1, 1
+wels_p1p2m1m2w_256:         ; word multipliers [1,2,-1,-2] x4, used by AVX2_DCT_HORIZONTAL
+    times 4 dw 1, 2, -1, -2
 wels_p1p1m1m1w_256:
     times 4 dw 1, 1, -1, -1
 
@@ -572,20 +576,20 @@
 ; AVX2 functions
 ;***********************************************************************
 
-; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 zero=%6 clobber=%7,%8
+; out=%1 (pPixel1 - pPixel2 as 16 words) pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8
 %macro AVX2_LoadDiff16P 8
     vmovq         x%1, [%2         ]
     vpbroadcastq  y%7, [%2 + 4 * %3]
     vpblendd      y%1, y%1, y%7, 11110000b
-    vpunpcklbw    y%1, y%1, y%6
+    vpshufb       y%1, y%1, y%6              ; with %6 = wels_shufb0312_movzxw: bytes -> words, each group of 4 in 0,3,1,2 order
     vmovq         x%7, [%4         ]
     vpbroadcastq  y%8, [%4 + 4 * %5]
     vpblendd      y%7, y%7, y%8, 11110000b
-    vpunpcklbw    y%7, y%7, y%6
+    vpshufb       y%7, y%7, y%6              ; same widening/reordering for the pPixel2 rows
     vpsubw        y%1, y%1, y%7
 %endmacro
 
-; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 zero=%8 clobber=%9,%10
+; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 wels_shufb0312_movzxw=%8 clobber=%9,%10
 %macro AVX2_StoreDiff32P 10
     vpaddw        y%3, y%3, y%7
     vpsraw        y%3, y%3, 6
@@ -593,7 +597,7 @@
     vpbroadcastq  y%10, [%5 + 4 * %6]
     add           %5, %6
     vpblendd      y%9, y%9, y%10, 11110000b
-    vpunpcklbw    y%9, y%9, y%8
+    vpshufb       y%9, y%9, y%8
     vpaddsw       y%3, y%3, y%9
     vpaddw        y%4, y%4, y%7
     vpsraw        y%4, y%4, 6
@@ -600,9 +604,11 @@
     vmovq         x%9,  [%5         ]
     vpbroadcastq  y%10, [%5 + 4 * %6]
     vpblendd      y%9, y%9, y%10, 11110000b
-    vpunpcklbw    y%9, y%9, y%8
+    vpshufb       y%9, y%9, y%8
     vpaddsw       y%4, y%4, y%9
     vpackuswb     y%3, y%3, y%4
+    vbroadcasti128 y%4, [wels_shufb0231_128]
+    vpshufb       y%3, y%3, y%4
     vextracti128  x%4, y%3, 1
     vmovlps       [%1         ], x%3
     vmovlps       [%1 + 4 * %2], x%4
@@ -674,29 +680,29 @@
 %endmacro
 
 ; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
-; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
+; Expects each group of 4 input words pre-scrambled as [x0,x3,x1,x2]; this saves a negation.
+; out=%1 as [y0,y1,y2,y3]  in=%1 as [x0,x3,x1,x2]  wels_shufb2301=%2  clobber=%3
 %macro AVX2_DCT_HORIZONTAL 3
-    vpsignw       %3, %1, [wels_p1m1p1m1w_256]  ; [x[0],-x[1],x[2],-x[3], ...]
-    vpshufb       %1, %1, %2                    ; [x[3],x[2],x[1],x[0], ...]
-    vpaddw        %1, %1, %3                    ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
-    vpmullw       %3, %1, [wels_p1m1m1p1w_256]  ; [s[0],-s[1],-s[2],s[3], ...]
+    vpsignw       %3, %1, [wels_p1m1p1m1w_256]  ; [x0,-x3,x1,-x2, ...]
+    vpshufb       %1, %1, %2                    ; swap adjacent words: [x3,x0,x2,x1, ...]
+    vpaddw        %1, %1, %3                    ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1, ...]
+    vpmullw       %3, %1, [wels_p1p2m1m2w_256]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
     vpshufd       %1, %1, 0b1h                  ; [s[2],s[3],s[0],s[1], ...]
-    vpmullw       %1, %1, [wels_p1p2p1p2w_256]  ; [s[2],2*s[3],s[0],2*s[1], ...]
-    vpaddw        %1, %1, %3                    ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+    vpaddw        %1, %1, %3                    ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
 %endmacro
 
 ; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register.
-; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
+; Each group of 4 output words comes out scrambled as [y0,y3,y1,y2]; this saves a negation.
+; out=%1 as [y0,y3,y1,y2]  in=%1 as [x0,x1,x2,x3]  wels_shufb2301=%2  clobber=%3
 %macro AVX2_IDCT_HORIZONTAL 3
-    vpsraw        %3, %1, 1                     ; [x[0]>>1,x[1]>>1,x[2]>>1,x[3]>>1, ...]
-    vpblendw      %3, %1, %3, 10101010b         ; [x[0],x[1]>>1,x[2],x[3]>>1, ...]
-    vpshufd       %1, %1, 0b1h                  ; [x[2],x[3],x[0],x[1], ...]
-    vpsignw       %1, %1, [wels_p1m1m1p1w_256]  ; [x[2],-x[3],-x[0],x[1], ...]
-    vpaddw        %1, %3, %1                    ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
-    vpshufb       %3, %1, %2                    ; [s[3],s[2],s[1],s[0], ...]
-    vpmullw       %1, %1, [wels_p1p1m1m1w_256]  ; [s[0],s[1],-s[2],-s[3], ...]
-    vpmullw       %3, %3, [wels_p1m1m1p1w_256]  ; [s[3],-s[2],-s[1],s[0], ...]
-    vpaddw        %1, %1, %3                    ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
+    vpsraw        %3, %1, 1                     ; [x0>>1,x1>>1,x2>>1,x3>>1, ...]
+    vpblendw      %3, %1, %3, 10101010b         ; keep even words, halve odd words: [x0,x1>>1,x2,x3>>1, ...]
+    vpsignw       %1, %1, [wels_p1p1m1m1w_256]  ; [x0,x1,-x2,-x3, ...]
+    vpshufd       %3, %3, 0b1h                  ; swap dwords in each qword: [x2,x3>>1,x0,x1>>1, ...]
+    vpaddw        %1, %3, %1                    ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3, ...]
+    vpshufb       %3, %1, %2                    ; [s[1],s[0],s[3],s[2], ...]
+    vpsignw       %1, %1, [wels_p1m1p1m1w_256]  ; [s[0],-s[1],s[2],-s[3], ...]
+    vpaddw        %1, %1, %3                    ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
 %endmacro
 
 ;***********************************************************************
@@ -709,7 +715,7 @@
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
 
-    vpxor ymm6, ymm6, ymm6
+    vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
 
     ;Load 4x16
     AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
@@ -724,7 +730,7 @@
     AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
 
     AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
-    vmovdqa ymm6, [wels_rev64w_256]
+    vbroadcasti128 ymm6, [wels_shufb2301_128]
     AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
     AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
     AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -748,7 +754,7 @@
     SIGN_EXTENSION r3, r3d
 
     AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
-    vmovdqa ymm6, [wels_rev64w_256]
+    vbroadcasti128 ymm6, [wels_shufb2301_128]
     AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
     AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
     AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -755,8 +761,8 @@
     AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
     AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
 
-    vpxor ymm6, ymm6, ymm6
-    WELS_DW32_VEX ymm7
+    vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
+    vbroadcasti128 ymm7, [wels_dw32_128]
     AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
     add r2, r3
     add r0, r1