ref: 9ba1d60bd1b0ff097b7fc88958be7ba7b589e7ca
dir: /third_party/libyuv/source/scale_win.cc/
/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "third_party/libyuv/include/libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for Visual C x86. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Offsets for source bytes 0 to 9 static uvec8 kShuf0 = { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. static uvec8 kShuf1 = { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. static uvec8 kShuf2 = { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; // Offsets for source bytes 0 to 10 static uvec8 kShuf01 = { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. static uvec8 kShuf11 = { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. static uvec8 kShuf21 = { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; // Coefficients for source bytes 0 to 10 static uvec8 kMadd01 = { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; // Coefficients for source bytes 10 to 21 static uvec8 kMadd11 = { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; // Coefficients for source bytes 21 to 31 static uvec8 kMadd21 = { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; // Coefficients for source bytes 21 to 31 static vec16 kRound34 = { 2, 2, 2, 2, 2, 2, 2, 2 }; static uvec8 kShuf38a = { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; static uvec8 kShuf38b = { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; // Arrange words 0,3,6 into 0,1,2 static uvec8 kShufAc = { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; // Arrange words 0,3,6 into 3,4,5 static uvec8 kShufAc3 = { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; // Scaling values for boxes of 3x3 and 2x3 static uvec16 kScaleAc33 = { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; // Arrange first value for pixels 0,1,2,3,4,5 static uvec8 kShufAb0 = { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; // Arrange second value for pixels 0,1,2,3,4,5 static uvec8 kShufAb1 = { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; // Arrange third value for pixels 0,1,2,3,4,5 static uvec8 kShufAb2 = { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; // Scaling values for boxes of 3x2 and 2x2 static uvec16 kScaleAb2 = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; // Reads 32 pixels, throws half away and writes 16 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop ret } } // Blends 32x1 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 // average columns (32 to 16 pixels) psrlw xmm0, 8 movdqa xmm3, xmm1 psrlw xmm1, 8 pand xmm2, xmm5 pand xmm3, xmm5 pavgw xmm0, xmm2 pavgw xmm1, xmm3 packuswb xmm0, xmm1 sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop ret } } // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + esi] movdqa xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) psrlw xmm0, 8 movdqa xmm3, xmm1 psrlw xmm1, 8 pand xmm2, xmm5 pand xmm3, xmm5 pavgw xmm0, xmm2 pavgw xmm1, xmm3 packuswb xmm0, xmm1 sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop pop esi ret } } // Reads 32 pixels, throws half away and writes 16 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width align 4 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop ret } } // Blends 32x1 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 align 4 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 // average columns (32 to 16 pixels) psrlw xmm0, 8 movdqa xmm3, xmm1 psrlw xmm1, 8 pand xmm2, xmm5 pand xmm3, xmm5 pavgw xmm0, xmm2 pavgw xmm1, xmm3 packuswb xmm0, xmm1 sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop ret } } // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 align 4 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) psrlw xmm0, 8 movdqa xmm3, xmm1 psrlw xmm1, 8 pand xmm2, xmm5 pand xmm3, xmm5 pavgw xmm0, xmm2 pavgw xmm1, xmm3 packuswb xmm0, xmm1 sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop pop esi ret } } // Point samples 32 pixels to 8 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 pslld xmm5, 16 align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] pand xmm0, xmm5 pand xmm1, xmm5 packuswb xmm0, xmm1 psrlw xmm0, 8 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [edx], xmm0 lea edx, [edx + 8] jg wloop ret } } // Blends 32x4 rectangle to 8x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_ptr mov esi, [esp + 8 + 8] // src_stride mov edx, [esp + 8 + 12] // dst_ptr mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff psrlw xmm7, 8 align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + esi] movdqa xmm3, [eax + esi + 16] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 movdqa xmm2, [eax + esi * 2] movdqa xmm3, [eax + esi * 2 + 16] movdqa xmm4, [eax + edi] movdqa xmm5, [eax + edi + 16] lea eax, [eax + 32] pavgb xmm2, xmm4 pavgb xmm3, xmm5 pavgb xmm0, xmm2 pavgb xmm1, xmm3 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) psrlw xmm0, 8 movdqa xmm3, xmm1 psrlw xmm1, 8 pand xmm2, xmm7 pand xmm3, xmm7 pavgw xmm0, xmm2 pavgw xmm1, xmm3 packuswb xmm0, xmm1 movdqa xmm2, xmm0 // average columns (16 to 8 pixels) psrlw xmm0, 8 pand xmm2, xmm7 pavgw xmm0, xmm2 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [edx], xmm0 lea edx, [edx + 8] jg wloop pop edi pop esi ret } } // Point samples 32 pixels to 24 pixels. // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width movdqa xmm3, kShuf0 movdqa xmm4, kShuf1 movdqa xmm5, kShuf2 align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm1 palignr xmm1, xmm0, 8 pshufb xmm0, xmm3 pshufb xmm1, xmm4 pshufb xmm2, xmm5 movq qword ptr [edx], xmm0 movq qword ptr [edx + 8], xmm1 movq qword ptr [edx + 16], xmm2 lea edx, [edx + 24] sub ecx, 24 jg wloop ret } } // Blends 32x2 rectangle to 24x1 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. // Register usage: // xmm0 src_row 0 // xmm1 src_row 1 // xmm2 shuf 0 // xmm3 shuf 1 // xmm4 shuf 2 // xmm5 madd 0 // xmm6 madd 1 // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, kShuf01 movdqa xmm3, kShuf11 movdqa xmm4, kShuf21 movdqa xmm5, kMadd01 movdqa xmm6, kMadd11 movdqa xmm7, kRound34 align 4 wloop: movdqa xmm0, [eax] // pixels 0..7 movdqa xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 pmaddubsw xmm0, xmm5 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm0, xmm1 pshufb xmm0, xmm3 pmaddubsw xmm0, xmm6 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 movdqa xmm0, [eax + 16] // pixels 16..23 movdqa xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 pshufb xmm0, xmm4 movdqa xmm1, kMadd21 pmaddubsw xmm0, xmm1 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 sub ecx, 24 movq qword ptr [edx + 16], xmm0 lea edx, [edx + 24] jg wloop pop esi ret } } // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, kShuf01 movdqa xmm3, kShuf11 movdqa xmm4, kShuf21 movdqa xmm5, kMadd01 movdqa xmm6, kMadd11 movdqa xmm7, kRound34 align 4 wloop: movdqa xmm0, [eax] // pixels 0..7 movdqa xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm2 pmaddubsw xmm0, xmm5 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm3 pmaddubsw xmm0, xmm6 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 movdqa xmm0, [eax + 16] // pixels 16..23 movdqa xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm4 movdqa xmm1, kMadd21 pmaddubsw xmm0, xmm1 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 sub ecx, 24 movq qword ptr [edx + 16], xmm0 lea edx, [edx+24] jg wloop pop esi ret } } // 3/8 point sampler // Scale 32 pixels to 12 __declspec(naked) __declspec(align(16)) void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width movdqa xmm4, kShuf38a movdqa xmm5, kShuf38b align 4 xloop: movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 paddusb xmm0, xmm1 sub ecx, 12 movq qword ptr [edx], xmm0 // write 12 pixels movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] jg xloop ret } } // Scale 16x3 pixels to 6x1 with interpolation __declspec(naked) __declspec(align(16)) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, kShufAc movdqa xmm3, kShufAc3 movdqa xmm4, kScaleAc33 pxor xmm5, xmm5 align 4 xloop: movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 movdqa xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 punpcklbw xmm6, xmm5 punpcklbw xmm7, xmm5 paddusw xmm0, xmm6 paddusw xmm1, xmm7 movdqa xmm6, [eax + esi * 2] lea eax, [eax + 16] movhlps xmm7, xmm6 punpcklbw xmm6, xmm5 punpcklbw xmm7, xmm5 paddusw xmm0, xmm6 paddusw xmm1, xmm7 movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 psrldq xmm0, 2 paddusw xmm6, xmm0 psrldq xmm0, 2 paddusw xmm6, xmm0 pshufb xmm6, xmm2 movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 psrldq xmm1, 2 paddusw xmm7, xmm1 psrldq xmm1, 2 paddusw xmm7, xmm1 pshufb xmm7, xmm3 paddusw xmm6, xmm7 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 packuswb xmm6, xmm6 sub ecx, 6 movd [edx], xmm6 // write 6 pixels psrlq xmm6, 16 movd [edx + 2], xmm6 lea edx, [edx + 6] jg xloop pop esi ret } } // Scale 16x2 pixels to 6x1 with interpolation __declspec(naked) __declspec(align(16)) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, kShufAb0 movdqa xmm3, kShufAb1 movdqa xmm4, kShufAb2 movdqa xmm5, kScaleAb2 align 4 xloop: movdqa xmm0, [eax] // average 2 rows into xmm0 pavgb xmm0, [eax + esi] lea eax, [eax + 16] movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 movdqa xmm6, xmm0 pshufb xmm6, xmm3 paddusw xmm1, xmm6 pshufb xmm0, xmm4 paddusw xmm1, xmm0 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 packuswb xmm1, xmm1 sub ecx, 6 movd [edx], xmm1 // write 6 pixels psrlq xmm1, 16 movd [edx + 2], xmm1 lea edx, [edx + 6] jg xloop pop esi ret } } // Reads 16xN bytes and produces 16 shorts at a time. // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. __declspec(naked) __declspec(align(16)) void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { __asm { push esi push edi push ebx push ebp mov esi, [esp + 16 + 4] // src_ptr mov edx, [esp + 16 + 8] // src_stride mov edi, [esp + 16 + 12] // dst_ptr mov ecx, [esp + 16 + 16] // dst_width mov ebx, [esp + 16 + 20] // height pxor xmm4, xmm4 dec ebx align 4 xloop: // first row movdqa xmm0, [esi] lea eax, [esi + edx] movdqa xmm1, xmm0 punpcklbw xmm0, xmm4 punpckhbw xmm1, xmm4 lea esi, [esi + 16] mov ebp, ebx test ebp, ebp je ydone // sum remaining rows align 4 yloop: movdqa xmm2, [eax] // read 16 pixels lea eax, [eax + edx] // advance to next row movdqa xmm3, xmm2 punpcklbw xmm2, xmm4 punpckhbw xmm3, xmm4 paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 sub ebp, 1 jg yloop align 4 ydone: movdqa [edi], xmm0 movdqa [edi + 16], xmm1 lea edi, [edi + 32] sub ecx, 16 jg xloop pop ebp pop ebx pop edi pop esi ret } } // Bilinear column filtering. SSSE3 version. // TODO(fbarchard): Port to Neon // TODO(fbarchard): Switch the following: // xor ebx, ebx // mov bx, word ptr [esi + eax] // 2 source x0 pixels // To // movzx ebx, word ptr [esi + eax] // 2 source x0 pixels // when drmemory bug fixed. // https://code.google.com/p/drmemory/issues/detail?id=1396 __declspec(naked) __declspec(align(16)) void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { __asm { push ebx push esi push edi mov edi, [esp + 12 + 4] // dst_ptr mov esi, [esp + 12 + 8] // src_ptr mov ecx, [esp + 12 + 12] // dst_width movd xmm2, [esp + 12 + 16] // x movd xmm3, [esp + 12 + 20] // dx mov eax, 0x04040000 // shuffle to line up fractions with pixel. movd xmm5, eax pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 punpckldq xmm2, xmm0 // x0 x1 punpckldq xmm3, xmm3 // dx dx paddd xmm3, xmm3 // dx * 2, dx * 2 pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. align 4 xloop2: movdqa xmm1, xmm2 // x0, x1 fractions. paddd xmm2, xmm3 // x += dx movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx psrlw xmm1, 9 // 7 bit fractions. movzx ebx, word ptr [esi + edx] // 2 source x1 pixels movd xmm4, ebx pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 pxor xmm1, xmm6 // 0..7f and 7f..0 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration. psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. packuswb xmm0, xmm0 // 8 bits, 2 pixels. movd ebx, xmm0 mov [edi], bx lea edi, [edi + 2] sub ecx, 2 // 2 pixels jge xloop2 align 4 xloop29: add ecx, 2 - 1 jl xloop99 // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx psrlw xmm2, 9 // 7 bit fractions. pshufb xmm2, xmm5 // 0011 pxor xmm2, xmm6 // 0..7f and 7f..0 pmaddubsw xmm0, xmm2 // 16 bit psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. packuswb xmm0, xmm0 // 8 bits movd ebx, xmm0 mov [edi], bl align 4 xloop99: pop edi pop esi pop ebx ret } } // Reads 16 pixels, duplicates them and writes 32 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { __asm { mov edx, [esp + 4] // dst_ptr mov eax, [esp + 8] // src_ptr mov ecx, [esp + 12] // dst_width align 4 wloop: movdqa xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 punpckhbw xmm1, xmm1 sub ecx, 32 movdqa [edx], xmm0 movdqa [edx + 16], xmm1 lea edx, [edx + 32] jg wloop ret } } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { __asm { mov eax, [esp + 4] // src_argb // src_stride ignored mov edx, [esp + 12] // dst_argb mov ecx, [esp + 16] // dst_width align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] shufps xmm0, xmm1, 0xdd sub ecx, 4 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop ret } } // Blends 8x1 rectangle to 4x1. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { __asm { mov eax, [esp + 4] // src_argb // src_stride ignored mov edx, [esp + 12] // dst_argb mov ecx, [esp + 16] // dst_width align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 shufps xmm0, xmm1, 0x88 // even pixels shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop ret } } // Blends 8x2 rectangle to 4x1. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // dst_width align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + esi] movdqa xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 movdqa xmm2, xmm0 // average columns (8 to 4 pixels) shufps xmm0, xmm1, 0x88 // even pixels shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop pop esi ret } } // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { __asm { push ebx push edi mov eax, [esp + 8 + 4] // src_argb // src_stride ignored mov ebx, [esp + 8 + 12] // src_stepx mov edx, [esp + 8 + 16] // dst_argb mov ecx, [esp + 8 + 20] // dst_width lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] align 4 wloop: movd xmm0, [eax] movd xmm1, [eax + ebx] punpckldq xmm0, xmm1 movd xmm2, [eax + ebx * 2] movd xmm3, [eax + edi] lea eax, [eax + ebx * 4] punpckldq xmm2, xmm3 punpcklqdq xmm0, xmm2 sub ecx, 4 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop pop edi pop ebx ret } } // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { __asm { push ebx push esi push edi mov eax, [esp + 12 + 4] // src_argb mov esi, [esp + 12 + 8] // src_stride mov ebx, [esp + 12 + 12] // src_stepx mov edx, [esp + 12 + 16] // dst_argb mov ecx, [esp + 12 + 20] // dst_width lea esi, [eax + esi] // row1 pointer lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] align 4 wloop: movq xmm0, qword ptr [eax] // row0 4 pairs movhps xmm0, qword ptr [eax + ebx] movq xmm1, qword ptr [eax + ebx * 2] movhps xmm1, qword ptr [eax + edi] lea eax, [eax + ebx * 4] movq xmm2, qword ptr [esi] // row1 4 pairs movhps xmm2, qword ptr [esi + ebx] movq xmm3, qword ptr [esi + ebx * 2] movhps xmm3, qword ptr [esi + edi] lea esi, [esi + ebx * 4] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 movdqa xmm2, xmm0 // average columns (8 to 4 pixels) shufps xmm0, xmm1, 0x88 // even pixels shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 movdqa [edx], xmm0 lea edx, [edx + 16] jg wloop pop edi pop esi pop ebx ret } } // Column scaling unfiltered. SSE2 version. __declspec(naked) __declspec(align(16)) void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { push edi push esi mov edi, [esp + 8 + 4] // dst_argb mov esi, [esp + 8 + 8] // src_argb mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 paddd xmm2, xmm0 paddd xmm3, xmm3 // 0, 0, 0, dx * 2 pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 paddd xmm2, xmm0 // x3 x2 x1 x0 paddd xmm3, xmm3 // 0, 0, 0, dx * 4 pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 pextrw eax, xmm2, 1 // get x0 integer. pextrw edx, xmm2, 3 // get x1 integer. cmp ecx, 0 jle xloop99 sub ecx, 4 jl xloop49 // 4 Pixel loop. align 4 xloop4: movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels pextrw eax, xmm2, 5 // get x2 integer. pextrw edx, xmm2, 7 // get x3 integer. paddd xmm2, xmm3 // x += dx punpckldq xmm0, xmm1 // x0 x1 movd xmm1, [esi + eax * 4] // 1 source x2 pixels movd xmm4, [esi + edx * 4] // 1 source x3 pixels pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration. punpckldq xmm1, xmm4 // x2 x3 punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 sub ecx, 4 // 4 pixels movdqu [edi], xmm0 lea edi, [edi + 16] jge xloop4 align 4 xloop49: test ecx, 2 je xloop29 // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels pextrw eax, xmm2, 5 // get x2 integer. punpckldq xmm0, xmm1 // x0 x1 movq qword ptr [edi], xmm0 lea edi, [edi + 8] xloop29: test ecx, 1 je xloop99 // 1 Pixels. movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd dword ptr [edi], xmm0 align 4 xloop99: pop esi pop edi ret } } // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. // TODO(fbarchard): Port to Neon // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static uvec8 kShuffleColARGB = { 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each static uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; __declspec(naked) __declspec(align(16)) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { push esi push edi mov edi, [esp + 8 + 4] // dst_argb mov esi, [esp + 8 + 8] // src_argb mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx movdqa xmm4, kShuffleColARGB movdqa xmm5, kShuffleFractions pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 punpckldq xmm2, xmm0 // x0 x1 punpckldq xmm3, xmm3 // dx dx paddd xmm3, xmm3 // dx * 2, dx * 2 pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. align 4 xloop2: movdqa xmm1, xmm2 // x0, x1 fractions. paddd xmm2, xmm3 // x += dx movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels psrlw xmm1, 9 // 7 bit fractions. movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels pshufb xmm1, xmm5 // 0000000011111111 pshufb xmm0, xmm4 // arrange pixels into pairs pxor xmm1, xmm6 // 0..7f and 7f..0 pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration. psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. movq qword ptr [edi], xmm0 lea edi, [edi + 8] sub ecx, 2 // 2 pixels jge xloop2 align 4 xloop29: add ecx, 2 - 1 jl xloop99 // 1 pixel remainder psrlw xmm2, 9 // 7 bit fractions. movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels pshufb xmm2, xmm5 // 00000000 pshufb xmm0, xmm4 // arrange pixels into pairs pxor xmm2, xmm6 // 0..7f and 7f..0 pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. psrlw xmm0, 7 packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. movd [edi], xmm0 align 4 xloop99: pop edi pop esi ret } } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { mov edx, [esp + 4] // dst_argb mov eax, [esp + 8] // src_argb mov ecx, [esp + 12] // dst_width align 4 wloop: movdqa xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 punpckldq xmm0, xmm0 punpckhdq xmm1, xmm1 sub ecx, 8 movdqa [edx], xmm0 movdqa [edx + 16], xmm1 lea edx, [edx + 32] jg wloop ret } } // Divide num by div and return as 16.16 fixed point result. __declspec(naked) __declspec(align(16)) int FixedDiv_X86(int num, int div) { __asm { mov eax, [esp + 4] // num cdq // extend num to 64 bits shld edx, eax, 16 // 32.16 shl eax, 16 idiv dword ptr [esp + 8] ret } } // Divide num by div and return as 16.16 fixed point result. __declspec(naked) __declspec(align(16)) int FixedDiv1_X86(int num, int div) { __asm { mov eax, [esp + 4] // num mov ecx, [esp + 8] // denom cdq // extend num to 64 bits shld edx, eax, 16 // 32.16 shl eax, 16 sub eax, 0x00010001 sbb edx, 0 sub ecx, 1 idiv ecx ret } } #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif