ref: 7a5596fa78ad41c428c2e3a4196d3b391d4ac77c
dir: /vpx_dsp/x86/deblock_sse2.asm/
; ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; ; Use of this source code is governed by a BSD-style license ; that can be found in the LICENSE file in the root of the source ; tree. An additional intellectual property rights grant can be found ; in the file PATENTS. All contributing project authors may ; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" ;macro in deblock functions %macro FIRST_2_ROWS 0 movdqa xmm4, xmm0 movdqa xmm6, xmm0 movdqa xmm5, xmm1 pavgb xmm5, xmm3 ;calculate absolute value psubusb xmm4, xmm1 psubusb xmm1, xmm0 psubusb xmm6, xmm3 psubusb xmm3, xmm0 paddusb xmm4, xmm1 paddusb xmm6, xmm3 ;get threshold movdqa xmm2, flimit pxor xmm1, xmm1 movdqa xmm7, xmm2 ;get mask psubusb xmm2, xmm4 psubusb xmm7, xmm6 pcmpeqb xmm2, xmm1 pcmpeqb xmm7, xmm1 por xmm7, xmm2 %endmacro %macro SECOND_2_ROWS 0 movdqa xmm6, xmm0 movdqa xmm4, xmm0 movdqa xmm2, xmm1 pavgb xmm1, xmm3 ;calculate absolute value psubusb xmm6, xmm2 psubusb xmm2, xmm0 psubusb xmm4, xmm3 psubusb xmm3, xmm0 paddusb xmm6, xmm2 paddusb xmm4, xmm3 pavgb xmm5, xmm1 ;get threshold movdqa xmm2, flimit pxor xmm1, xmm1 movdqa xmm3, xmm2 ;get mask psubusb xmm2, xmm6 psubusb xmm3, xmm4 pcmpeqb xmm2, xmm1 pcmpeqb xmm3, xmm1 por xmm7, xmm2 por xmm7, xmm3 pavgb xmm5, xmm0 ;decide if or not to use filtered value pand xmm0, xmm7 pandn xmm7, xmm5 paddusb xmm0, xmm7 %endmacro %macro UPDATE_FLIMIT 0 movdqu xmm2, XMMWORD PTR [rbx] movdqu [rsp], xmm2 add rbx, 16 %endmacro SECTION .text ;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, ; unsigned char *dst_ptr, ; int src_pixels_per_line, ; int dst_pixels_per_line, ; int cols, ; int *flimits, ; int size ;) globalsym(vpx_post_proc_down_and_across_mb_row_sse2) sym(vpx_post_proc_down_and_across_mb_row_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rbx push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 ; put flimit on stack mov rbx, arg(5) ;flimits ptr UPDATE_FLIMIT %define flimit [rsp] mov rsi, arg(0) ;src_ptr mov rdi, arg(1) ;dst_ptr movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock .nextrow: xor rdx, rdx ;col .nextcol: ;load current and next 2 rows movdqu xmm0, XMMWORD PTR [rsi] movdqu xmm1, XMMWORD PTR [rsi + rax] movdqu xmm3, XMMWORD PTR [rsi + 2*rax] FIRST_2_ROWS ;load above 2 rows neg rax movdqu xmm1, XMMWORD PTR [rsi + 2*rax] movdqu xmm3, XMMWORD PTR [rsi + rax] SECOND_2_ROWS movdqu XMMWORD PTR [rdi], xmm0 neg rax ; positive stride add rsi, 16 add rdi, 16 add rdx, 16 cmp edx, dword arg(4) ;cols jge .downdone UPDATE_FLIMIT jmp .nextcol .downdone: ; done with the all cols, start the across filtering in place sub rsi, rdx sub rdi, rdx mov rbx, arg(5) ; flimits UPDATE_FLIMIT ; dup the first byte into the left border 8 times movq mm1, [rdi] punpcklbw mm1, mm1 punpcklwd mm1, mm1 punpckldq mm1, mm1 mov rdx, -8 movq [rdi+rdx], mm1 ; dup the last byte into the right border movsxd rdx, dword arg(4) movq mm1, [rdi + rdx + -1] punpcklbw mm1, mm1 punpcklwd mm1, mm1 punpckldq mm1, mm1 movq [rdi+rdx], mm1 xor rdx, rdx movq mm0, QWORD PTR [rdi-16]; movq mm1, QWORD PTR [rdi-8]; .acrossnextcol: movdqu xmm0, XMMWORD PTR [rdi + rdx] movdqu xmm1, XMMWORD PTR [rdi + rdx -2] movdqu xmm3, XMMWORD PTR [rdi + rdx -1] FIRST_2_ROWS movdqu xmm1, XMMWORD PTR [rdi + rdx +1] movdqu xmm3, XMMWORD PTR [rdi + rdx +2] SECOND_2_ROWS movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes movdq2q mm0, xmm0 psrldq xmm0, 8 movdq2q mm1, xmm0 add rdx, 16 cmp edx, dword arg(4) ;cols jge .acrossdone UPDATE_FLIMIT jmp .acrossnextcol .acrossdone: ; last 16 pixels movq QWORD PTR [rdi+rdx-16], mm0 cmp edx, dword arg(4) jne .throw_last_8 movq QWORD PTR [rdi+rdx-8], mm1 .throw_last_8: ; done with this rwo add rsi,rax ;next src line mov eax, dword arg(3) ;dst_pixels_per_line add rdi,rax ;next destination mov eax, dword arg(2) ;src_pixels_per_line mov rbx, arg(5) ;flimits UPDATE_FLIMIT dec rcx ;decrement count jnz .nextrow ;next row add rsp, 16 pop rsp ; begin epilog pop rdi pop rsi pop rbx RESTORE_XMM UNSHADOW_ARGS pop rbp ret %undef flimit ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, ; int pitch, int rows, int cols,int flimit) globalsym(vpx_mbpost_proc_across_ip_sse2) sym(vpx_mbpost_proc_across_ip_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 SAVE_XMM 7 GET_GOT rbx push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 ; create flimit4 at [rsp] mov eax, dword ptr arg(4) ;flimit mov [rsp], eax mov [rsp+4], eax mov [rsp+8], eax mov [rsp+12], eax %define flimit4 [rsp] ;for(r=0;r<rows;r++) .ip_row_loop: xor rdx, rdx ;sumsq=0; xor rcx, rcx ;sum=0; mov rsi, arg(0); s ; dup the first byte into the left border 8 times movq mm1, [rsi] punpcklbw mm1, mm1 punpcklwd mm1, mm1 punpckldq mm1, mm1 mov rdi, -8 movq [rsi+rdi], mm1 ; dup the last byte into the right border movsxd rdx, dword arg(3) movq mm1, [rsi + rdx + -1] punpcklbw mm1, mm1 punpcklwd mm1, mm1 punpckldq mm1, mm1 movq [rsi+rdx], mm1 .ip_var_loop: ;for(i=-8;i<=6;i++) ;{ ; sumsq += s[i]*s[i]; ; sum += s[i]; ;} movzx eax, byte [rsi+rdi] add ecx, eax mul al add edx, eax add rdi, 1 cmp rdi, 6 jle .ip_var_loop ;mov rax, sumsq ;movd xmm7, rax movd xmm7, edx ;mov rax, sum ;movd xmm6, rax movd xmm6, ecx mov rsi, arg(0) ;s xor rcx, rcx movsxd rdx, dword arg(3) ;cols add rdx, 8 pxor mm0, mm0 pxor mm1, mm1 pxor xmm0, xmm0 .nextcol4: movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 punpcklbw xmm1, xmm0 ; expanding punpcklbw xmm2, xmm0 ; expanding punpcklwd xmm1, xmm0 ; expanding to dwords punpcklwd xmm2, xmm0 ; expanding to dwords psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 paddd xmm6, xmm2 paddd xmm7, xmm1 pshufd xmm6, xmm6, 0 ; duplicate the last ones pshufd xmm7, xmm7, 0 ; duplicate the last ones psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared paddd xmm6, xmm4 paddd xmm7, xmm3 pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared paddd xmm7, xmm3 paddd xmm6, xmm4 pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared paddd xmm7, xmm3 paddd xmm6, xmm4 movdqa xmm3, xmm6 pmaddwd xmm3, xmm3 movdqa xmm5, xmm7 pslld xmm5, 4 psubd xmm5, xmm7 psubd xmm5, xmm3 psubd xmm5, flimit4 psrad xmm5, 31 packssdw xmm5, xmm0 packsswb xmm5, xmm0 movd xmm1, DWORD PTR [rsi+rcx] movq xmm2, xmm1 punpcklbw xmm1, xmm0 punpcklwd xmm1, xmm0 paddd xmm1, xmm6 paddd xmm1, [GLOBAL(four8s)] psrad xmm1, 4 packssdw xmm1, xmm0 packuswb xmm1, xmm0 pand xmm1, xmm5 pandn xmm5, xmm2 por xmm5, xmm1 movd [rsi+rcx-8], mm0 movq mm0, mm1 movdq2q mm1, xmm5 psrldq xmm7, 12 psrldq xmm6, 12 add rcx, 4 cmp rcx, rdx jl .nextcol4 ;s+=pitch; movsxd rax, dword arg(1) add arg(0), rax sub dword arg(2), 1 ;rows-=1 cmp dword arg(2), 0 jg .ip_row_loop add rsp, 16 pop rsp ; begin epilog pop rdi pop rsi RESTORE_GOT RESTORE_XMM UNSHADOW_ARGS pop rbp ret %undef flimit4 SECTION_RODATA align 16 four8s: times 4 dd 8