ref: 3ec043a795cdf4dfa4cf31a9a0c82cbeef52c866
dir: /vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm/
; ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. ; ; Use of this source code is governed by a BSD-style license ; that can be found in the LICENSE file in the root of the source ; tree. An additional intellectual property rights grant can be found ; in the file PATENTS. All contributing project authors may ; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" %macro GET_PARAM_4 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov ecx, 0x01000100 movdqa xmm3, [rdx] ;load filters psrldq xmm3, 6 packsswb xmm3, xmm3 pshuflw xmm3, xmm3, 0b ;k3_k4 movd xmm2, ecx ;rounding_shift pshufd xmm2, xmm2, 0 movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height %endm %macro APPLY_FILTER_4 1 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm3 pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) packuswb xmm0, xmm0 ;pack to byte %if %1 movd xmm1, [rdi] pavgb xmm0, xmm1 %endif movd [rdi], xmm0 lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx %endm %macro GET_PARAM 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov ecx, 0x01000100 movdqa xmm7, [rdx] ;load filters psrldq xmm7, 6 packsswb xmm7, xmm7 pshuflw xmm7, xmm7, 0b ;k3_k4 punpcklwd xmm7, xmm7 movd xmm6, ecx ;rounding_shift pshufd xmm6, xmm6, 0 movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height %endm %macro APPLY_FILTER_8 1 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm7 pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) packuswb xmm0, xmm0 ;pack back to byte %if %1 movq xmm1, [rdi] pavgb xmm0, xmm1 %endif movq [rdi], xmm0 ;store the result lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx %endm %macro APPLY_FILTER_16 1 punpcklbw xmm0, xmm1 punpckhbw xmm2, xmm1 pmaddubsw xmm0, xmm7 pmaddubsw xmm2, xmm7 pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) pmulhrsw xmm2, xmm6 packuswb xmm0, xmm2 ;pack back to byte %if %1 movdqu xmm1, [rdi] pavgb xmm0, xmm1 %endif movdqu [rdi], xmm0 ;store the result lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx %endm SECTION .text globalsym(vpx_filter_block1d4_v2_ssse3) sym(vpx_filter_block1d4_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 push rsi push rdi ; end prolog GET_PARAM_4 .loop: movd xmm0, [rsi] ;load src movd xmm1, [rsi + rax] APPLY_FILTER_4 0 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d8_v2_ssse3) sym(vpx_filter_block1d8_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movq xmm0, [rsi] ;0 movq xmm1, [rsi + rax] ;1 APPLY_FILTER_8 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d16_v2_ssse3) sym(vpx_filter_block1d16_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;0 movdqu xmm1, [rsi + rax] ;1 movdqa xmm2, xmm0 APPLY_FILTER_16 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d4_v2_avg_ssse3) sym(vpx_filter_block1d4_v2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 push rsi push rdi ; end prolog GET_PARAM_4 .loop: movd xmm0, [rsi] ;load src movd xmm1, [rsi + rax] APPLY_FILTER_4 1 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d8_v2_avg_ssse3) sym(vpx_filter_block1d8_v2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movq xmm0, [rsi] ;0 movq xmm1, [rsi + rax] ;1 APPLY_FILTER_8 1 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d16_v2_avg_ssse3) sym(vpx_filter_block1d16_v2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;0 movdqu xmm1, [rsi + rax] ;1 movdqa xmm2, xmm0 APPLY_FILTER_16 1 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d4_h2_ssse3) sym(vpx_filter_block1d4_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 push rsi push rdi ; end prolog GET_PARAM_4 .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 1 APPLY_FILTER_4 0 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d8_h2_ssse3) sym(vpx_filter_block1d8_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 1 APPLY_FILTER_8 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d16_h2_ssse3) sym(vpx_filter_block1d16_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqu xmm1, [rsi + 1] movdqa xmm2, xmm0 APPLY_FILTER_16 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d4_h2_avg_ssse3) sym(vpx_filter_block1d4_h2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 push rsi push rdi ; end prolog GET_PARAM_4 .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 1 APPLY_FILTER_4 1 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d8_h2_avg_ssse3) sym(vpx_filter_block1d8_h2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 1 APPLY_FILTER_8 1 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(vpx_filter_block1d16_h2_avg_ssse3) sym(vpx_filter_block1d16_h2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqu xmm1, [rsi + 1] movdqa xmm2, xmm0 APPLY_FILTER_16 1 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret