ref: c176e6490403076105faa2a07f275d31ec61d2a3
parent: fa0076282e62f649483bde868602aab86448a661
author: Johann <johann.koenig@duck.com>
date: Thu Oct 25 09:37:50 EDT 2018
vpx postproc: rewrite in intrinsics. About 10% faster on 64-bit but about 10% slower on 32-bit. Removes the assembly usage of vpx_rv. Change-Id: I214698fb5677f615dee0a8f5f5bb8f64daf2565e
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -69,6 +69,7 @@
DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c
DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c
endif # CONFIG_POSTPROC
--- a/vpx_dsp/x86/deblock_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -232,237 +232,6 @@
ret
%undef flimit
-;void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-; Filters dst down its columns, one 8-pixel-wide strip per .loop_col pass: a pixel becomes (pixel + 15-row column sum + vpx_rv term) >> 4 when 15*sumsq - sum*sum < flimit, otherwise it is kept.
-extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_sse2) PRIVATE
-sym(vpx_mbpost_proc_down_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 128+16 ; 128 bytes for d[16][8] plus 16 bytes for flimit4
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit4 (flimit replicated into four dwords) at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
- mov [rsp+128+8], eax
- mov [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vpx_rv))] ; 64-bit builds keep the vpx_rv base address in r8 for the row loop
-%endif
-
- ;rows +=8;
- add dword arg(2), 8
-
- ;for(c=0; c<cols; c+=8)
-.loop_col:
- mov rsi, arg(0) ; s
- pxor xmm0, xmm0 ; xmm0 = 0, the zero operand for every unpack below
-
- movsxd rax, dword ptr arg(1) ;pitch ;
-
- ; this copies the last row down into the border 8 rows
- mov rdi, rsi
- mov rdx, arg(2) ; rows, already increased by 8 above
- sub rdx, 9 ; (rows + 8) - 9 = index of the last image row
- imul rdx, rax ; byte offset of that row
- lea rdi, [rdi+rdx]
- movq xmm1, QWORD ptr[rdi] ; last row
- mov rcx, 8
-.init_borderd: ; replicate the last row into the 8 rows below it
- lea rdi, [rdi + rax]
- movq [rdi], xmm1
-
- dec rcx
- jne .init_borderd
-
- neg rax ; rax = -pitch
-
- ; this copies the first row up into the border 8 rows
- mov rdi, rsi
- movq xmm1, QWORD ptr[rdi] ; first row
- mov rcx, 8
-.init_border: ; replicate the first row into the 8 rows above it (rax = -pitch here)
- lea rdi, [rdi + rax]
- movq [rdi], xmm1
-
- dec rcx
- jne .init_border
-
-
-
- lea rsi, [rsi + rax*8]; ; rsi = &s[-pitch*8] (rax = -pitch here)
- neg rax ; rax = +pitch again
-
- pxor xmm5, xmm5 ; xmm5 = running column sums, 8 x 16-bit
- pxor xmm6, xmm6 ; xmm6 = running sums of squares, columns 0-3, 32-bit
-
- pxor xmm7, xmm7 ; xmm7 = running sums of squares, columns 4-7, 32-bit
- mov rdi, rsi
-
- mov rcx, 15 ; prime the window with the 15 rows starting at s - 8*pitch
-
-.loop_initvar:
- movq xmm1, QWORD PTR [rdi];
- punpcklbw xmm1, xmm0 ; widen 8 bytes to 8 words
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ; squares of 8-bit values fit in 16 bits
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ; zero-extend squares of columns 0-3 to 32 bits
-
- punpckhwd xmm2, xmm0 ; zero-extend squares of columns 4-7 to 32 bits
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx ; rdx = row counter
-.loop_row:
- movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
- movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2 ; slide the window: add the entering row
- psubw xmm5, xmm1 ; and remove the leaving row
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2 ; sumsq += entering row squared
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1 ; sumsq -= leaving row squared
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6 ; xmm3 = 16*sumsq - sumsq = 15*sumsq, columns 0-3
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1 ; low 16 bits of sum*sum
-
- pmulhw xmm4, xmm4 ; high 16 bits of sum*sum (signed multiply)
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4 ; xmm1 = 32-bit sum*sum, columns 0-3
- punpckhwd xmm2, xmm4 ; xmm2 = 32-bit sum*sum, columns 4-7
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7 ; xmm4 = 15*sumsq, columns 4-7
-
- psubd xmm3, xmm1 ; 15*sumsq - sum*sum
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4 ; negative when 15*sumsq - sum*sum < flimit
- psubd xmm4, flimit4
-
- psrad xmm3, 31 ; spread the sign bit: all ones where the pixel gets filtered
- psrad xmm4, 31
-
- packssdw xmm3, xmm4 ; narrow the 32-bit masks to words
- packsswb xmm3, xmm0 ; then to bytes, one mask byte per pixel
-
- movq xmm1, QWORD PTR [rsi+rax*8] ; current row: rsi + 8*pitch
-
- movq xmm2, xmm1 ; keep the unfiltered bytes for the select below
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5 ; pixel + sum
- mov rcx, rdx
-
- and rcx, 127 ; rcx = row & 127, the offset into vpx_rv
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax ; rax (pitch) is borrowed as scratch for the table address
- lea rax, [GLOBAL(sym(vpx_rv))]
- movdqu xmm4, [rax + rcx*2] ; 8 words starting at byte offset rcx*2 into vpx_rv
- pop rax
-%elif ABI_IS_32BIT=0
- movdqu xmm4, [r8 + rcx*2] ; 8 words starting at byte offset rcx*2 into vpx_rv
-%else
- movdqu xmm4, [sym(vpx_rv) + rcx*2]
-%endif
-
- paddw xmm1, xmm4
- ;paddw xmm1, eight8s
- psraw xmm1, 4 ; (pixel + sum + vpx_rv term) >> 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3 ; filtered bytes where the mask is set
-
- pandn xmm3, xmm2 ; original bytes everywhere else
- por xmm1, xmm3
-
- and rcx, 15
- movq QWORD PTR [rsp + rcx*8], xmm1 ; d[row & 15] = result for the current row
-
- cmp edx, 8
- jl .skip_assignment ; nothing is written back during the first 8 rows
-
- mov rcx, rdx
- sub rcx, 8
- and rcx, 15
- movq mm0, [rsp + rcx*8] ; d[(row - 8) & 15]; NOTE(review): uses MMX register mm0 and no emms is visible in this routine
- movq [rsi], mm0 ; store the result computed 8 rows earlier at current row - 8*pitch
-
-.skip_assignment:
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows (already increased by 8)
- jl .loop_row
-
- add dword arg(0), 8 ; s += 8 ; NOTE(review): dword-sized add on the pointer argument slot - confirm for 64-bit builds
- sub dword arg(3), 8 ; cols -= 8
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 128+16
- pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro body not shown here)
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
; int pitch, int rows, int cols,int flimit)
--- /dev/null
+++ b/vpx_dsp/x86/post_proc_sse2.c
@@ -1,0 +1,168 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+extern const int16_t vpx_rv[];
+
+// Vertical ("down") post-processing filter, SSE2 intrinsics version.
+//
+// dst points at the first pixel, pitch is the byte distance between rows, and
+// the image is handled in strips of 8 columns. For every pixel a 15-row column
+// window (7 rows above, the row itself, 7 rows below) supplies sum and sumsq.
+// Rows above the first row count as copies of the first row and rows below
+// the last row as copies of the last row. When
+//   sumsq * 15 - sum * sum < flimit
+// the pixel becomes (vpx_rv[(row & 127) + lane] + sum + pixel) >> 4, where
+// lane is the column within the strip; otherwise it is left unchanged.
+//
+// cols must be a multiple of 8 and rows at least 8 (both asserted). The row
+// loop runs to rows + 8, so the 8 rows following the image are read and
+// rewritten as well.
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
+                               int cols, int flimit) {
+  int col;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i f = _mm_set1_epi32(flimit);
+  // Ring buffer holding the 8 most recent unfiltered rows, widened to 16 bits
+  // and indexed by row & 7.
+  DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  for (col = 0; col < cols; col += 8) {
+    int row, i;
+    __m128i s = _mm_loadl_epi64((__m128i *)dst);
+    __m128i sum, sumsq_0, sumsq_1;
+    __m128i tmp_0, tmp_1;
+    // Reloaded on the first row whenever rows >= 8. Zero-initialized so the
+    // value is never indeterminate when the asserts are compiled out and so
+    // compilers do not report a possibly uninitialized use.
+    __m128i below_context = _mm_setzero_si128();
+
+    s = _mm_unpacklo_epi8(s, zero);
+
+    // The 8 context rows above the image are copies of the first row.
+    for (i = 0; i < 8; ++i) {
+      _mm_store_si128((__m128i *)above_context + i, s);
+    }
+
+    // sum = s * 9: the 8 copies above plus the first row itself.
+    sum = _mm_slli_epi16(s, 3);
+    sum = _mm_add_epi16(s, sum);
+
+    // s^2 * 9 == (s * 9) * s
+    tmp_0 = _mm_mullo_epi16(sum, s);
+    tmp_1 = _mm_mulhi_epi16(sum, s);
+
+    sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
+    sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
+
+    // Prime sum/sumsq with rows 1..6. Row 7 enters on the first iteration of
+    // the row loop.
+    for (i = 1; i <= 6; ++i) {
+      __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
+      a = _mm_unpacklo_epi8(a, zero);
+      sum = _mm_add_epi16(sum, a);
+      a = _mm_mullo_epi16(a, a);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
+    }
+
+    for (row = 0; row < rows + 8; row++) {
+      // (row + 8) & 7 == row & 7: the slot that holds the row leaving the
+      // window is the one that receives the current row at the end.
+      __m128i *const context_slot = (__m128i *)above_context + (row & 7);
+      const __m128i above = _mm_load_si128(context_slot);
+      __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
+      __m128i above_sq, below_sq;
+      __m128i mask_0, mask_1;
+      __m128i multmp_0, multmp_1;
+      __m128i rv;
+      __m128i out;
+
+      this_row = _mm_unpacklo_epi8(this_row, zero);
+
+      if (row + 7 < rows) {
+        // Instead of copying the end context we just stop loading when we get
+        // to the last one.
+        below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
+        below_context = _mm_unpacklo_epi8(below_context, zero);
+      }
+
+      sum = _mm_sub_epi16(sum, above);
+      sum = _mm_add_epi16(sum, below_context);
+
+      // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
+      // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
+      // because x86 does not have unpack with sign extension.
+      above_sq = _mm_mullo_epi16(above, above);
+      sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
+      sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
+
+      below_sq = _mm_mullo_epi16(below_context, below_context);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
+
+      // sumsq * 16 - sumsq == sumsq * 15
+      mask_0 = _mm_slli_epi32(sumsq_0, 4);
+      mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
+      mask_1 = _mm_slli_epi32(sumsq_1, 4);
+      mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
+
+      // sum * sum as 32-bit values, built from the low and high halves.
+      multmp_0 = _mm_mullo_epi16(sum, sum);
+      multmp_1 = _mm_mulhi_epi16(sum, sum);
+
+      mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
+      mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
+
+      // mask - f gives a negative value when mask < f
+      mask_0 = _mm_sub_epi32(mask_0, f);
+      mask_1 = _mm_sub_epi32(mask_1, f);
+
+      // Shift the sign bit down to create a mask
+      mask_0 = _mm_srai_epi32(mask_0, 31);
+      mask_1 = _mm_srai_epi32(mask_1, 31);
+
+      mask_0 = _mm_packs_epi32(mask_0, mask_1);
+
+      // Unaligned load: lane k receives vpx_rv[(row & 127) + k].
+      rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
+
+      mask_1 = _mm_add_epi16(rv, sum);
+      mask_1 = _mm_add_epi16(mask_1, this_row);
+      mask_1 = _mm_srai_epi16(mask_1, 4);
+
+      // Take the filtered value where the mask is set, else the original.
+      mask_1 = _mm_and_si128(mask_0, mask_1);
+      mask_0 = _mm_andnot_si128(mask_0, this_row);
+      out = _mm_or_si128(mask_1, mask_0);
+
+      _mm_storel_epi64((__m128i *)(dst + row * pitch),
+                       _mm_packus_epi16(out, zero));
+
+      // Keep the unfiltered row: it leaves the window 8 rows from now.
+      _mm_store_si128(context_slot, this_row);
+    }
+
+    dst += 8;
+  }
+}