ref: 308e31a3ef97fa7a5bf9a232b15587955e5ec89f
parent: 1cf1ea94704b759aad8bacab0b7fa833ac7e452e
parent: 5227798c570af08d08dd6fdd7a3e96d5dc96977b
author: John Koleszar <jkoleszar@google.com>
date: Mon Apr 25 05:13:41 EDT 2011
Merge remote branch 'internal/upstream-experimental' into HEAD

Conflicts:
    vp8/decoder/onyxd_int.h

Change-Id: Icf445b589c2bc61d93d8c977379bbd84387d0488
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -884,6 +884,8 @@
link_with_cc=gcc
tune_cflags="-march="
setup_gnu_toolchain
+ # for 32-bit x86 builds, -O3 did not turn on this flag
+ enabled optimizations && check_add_cflags -fomit-frame-pointer
;;
esac
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -447,6 +447,8 @@
obj_int_extract)
tag Tool \
Name="VCCLCompilerTool" \
+ Optimization="2" \
+ FavorSizeorSpeed="1" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
RuntimeLibrary="$release_runtime" \
@@ -462,6 +464,8 @@
tag Tool \
Name="VCCLCompilerTool" \
+ Optimization="2" \
+ FavorSizeorSpeed="1" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
RuntimeLibrary="$release_runtime" \
@@ -476,6 +480,8 @@
tag Tool \
Name="VCCLCompilerTool" \
AdditionalIncludeDirectories="$incs" \
+ Optimization="2" \
+ FavorSizeorSpeed="1" \
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
RuntimeLibrary="$release_runtime" \
UsePrecompiledHeader="0" \
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -175,8 +175,6 @@
unsigned char need_to_clamp_mvs;
unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
-
- unsigned char force_no_skip; /* encoder only */
} MB_MODE_INFO;
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -207,12 +207,12 @@
}
+/*encoder only*/
void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
{
int i;
- if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
- x->mode_info_context->mbmi.mode != SPLITMV)
+ if (x->mode_info_context->mbmi.mode != SPLITMV)
{
unsigned char *uptr, *vptr;
unsigned char *upred_ptr = &x->predictor[256];
@@ -257,61 +257,25 @@
}
/*encoder only*/
-void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = x->predictor;
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->block[0].pre_stride;
- if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
- x->mode_info_context->mbmi.mode != SPLITMV)
- {
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = x->predictor;
- int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
- int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
- int pre_stride = x->block[0].pre_stride;
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
- ptr_base = x->pre.y_buffer;
- ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
- }
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
}
else
{
- int i;
-
- if (x->mode_info_context->mbmi.partitioning < 3)
- {
- for (i = 0; i < 4; i++)
- {
- BLOCKD *d = &x->block[bbb[i]];
- build_inter_predictors4b(x, d, 16);
- }
-
- }
- else
- {
- for (i = 0; i < 16; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, 16);
- else
- {
- vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
- vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
- }
-
- }
- }
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
}
}
@@ -318,8 +282,7 @@
void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
{
- if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
- x->mode_info_context->mbmi.mode != SPLITMV)
+ if (x->mode_info_context->mbmi.mode != SPLITMV)
{
int offset;
unsigned char *ptr_base;
@@ -535,61 +498,61 @@
-void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x)
{
- /*unsigned char *pred_ptr = x->block[0].predictor;
- unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/
- unsigned char *pred_ptr = x->predictor;
unsigned char *dst_ptr = x->dst.y_buffer;
- if (x->mode_info_context->mbmi.mode != SPLITMV)
- {
- int offset;
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *uptr, *vptr;
- /*unsigned char *pred_ptr = x->predictor;
- unsigned char *upred_ptr = &x->predictor[256];
- unsigned char *vpred_ptr = &x->predictor[320];*/
- unsigned char *udst_ptr = x->dst.u_buffer;
- unsigned char *vdst_ptr = x->dst.v_buffer;
+ int offset;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+ unsigned char *udst_ptr = x->dst.u_buffer;
+ unsigned char *vdst_ptr = x->dst.v_buffer;
- int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
- int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
- int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/
- ptr_base = x->pre.y_buffer;
- ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
+ }
- mv_row = x->block[16].bmi.mv.as_mv.row;
- mv_col = x->block[16].bmi.mv.as_mv.col;
- pre_stride >>= 1;
- offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
- uptr = x->pre.u_buffer + offset;
- vptr = x->pre.v_buffer + offset;
+ mv_row = x->block[16].bmi.mv.as_mv.row;
+ mv_col = x->block[16].bmi.mv.as_mv.col;
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
- x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride);
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
- }
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
}
else
{
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
+ }
+}
+void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
+{
+ unsigned char *dst_ptr = x->dst.y_buffer;
+
+ if (x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vp8_build_inter16x16_predictors_mb_s(x);
+ }
+ else
+ {
/* note: this whole ELSE branch is never executed, so there is no way to test the correctness of this modification.
* If something goes wrong later, revert to the code in build_inter_predictors_mb.
*/
@@ -599,26 +562,21 @@
{
for (i = 0; i < 4; i++)
{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
BLOCKD *d = &x->block[bbb[i]];
- /*build_inter_predictors4b(x, d, 16);*/
- {
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
+ ptr_base = *(d->base_pre);
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
- ptr_base = *(d->base_pre);
- ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-
- if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
+ }
}
}
else
@@ -633,7 +591,6 @@
/*build_inter_predictors2b(x, d0, 16);*/
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d0->predictor;
ptr_base = *(d0->base_pre);
ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
@@ -665,7 +622,6 @@
/*build_inter_predictors2b(x, d0, 8);*/
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d0->predictor;
ptr_base = *(d0->base_pre);
ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -14,8 +14,9 @@
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mby(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x);
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -12,8 +12,6 @@
#ifndef _PTHREAD_EMULATION
#define _PTHREAD_EMULATION
-#define VPXINFINITE 10000 /* 10second. */
-
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
/* Thread management macros */
@@ -28,7 +26,7 @@
#define pthread_t HANDLE
#define pthread_attr_t DWORD
#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)
-#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
+#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
#define thread_sleep(nms) Sleep(nms)
#define pthread_cancel(thread) terminate_thread(thread,0)
@@ -62,7 +60,7 @@
#define sem_t HANDLE
#define pause(voidpara) __asm PAUSE
#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL)
-#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE))
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
#define sem_post(sem) SetEvent(*sem)
#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
#define thread_sleep(nms) Sleep(nms)
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -32,9 +32,6 @@
mov rdx, arg(1) ; dequant
mov rax, arg(0) ; qcoeff
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
movd xmm4, [rax]
movd xmm5, [rdx]
@@ -43,9 +40,12 @@
pmullw xmm4, xmm5
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
; clear coeffs
- movd [rax], xmm7
- movd [rax+32], xmm7
+ movd [rax], xmm5
+ movd [rax+32], xmm5
;pshufb
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
@@ -62,10 +62,10 @@
lea rcx, [3*rcx]
movq xmm3, [rax+rcx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
mov rax, arg(3) ; dst
movsxd rdx, dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@
paddw xmm3, xmm4
; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
; store blocks back out
movq [rax], xmm0
@@ -102,6 +102,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -347,6 +348,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -377,8 +379,8 @@
mov rdi, arg(3) ; dst
mov rdx, arg(5) ; dc
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx]
@@ -398,10 +400,10 @@
psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
; Add to predict buffer
paddw xmm0, xmm4
@@ -410,10 +412,10 @@
paddw xmm3, xmm4
; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
; Load destination stride before writing out,
; doesn't need to persist
@@ -441,6 +443,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -692,6 +695,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -17,7 +17,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
- SAVE_XMM
+ SAVE_XMM 6
push rsi
push rdi
; end prolog
@@ -41,7 +41,7 @@
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm3 ;d1 a1
punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm7, eax
+ movd xmm6, eax
movdqa xmm1, xmm4 ;c1 b1
paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
@@ -66,7 +66,7 @@
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm4 ;ip[4] ip[0]
- pshufd xmm7, xmm7, 0 ;03 03 03 03 03 03 03 03
+ pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
@@ -90,8 +90,8 @@
punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm7
- paddw xmm1, xmm7
+ paddw xmm5, xmm6
+ paddw xmm1, xmm6
psraw xmm5, 3
psraw xmm1, 3
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -288,7 +288,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -338,7 +338,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -584,7 +584,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -634,7 +634,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1024,7 +1024,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1091,7 +1091,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1249,7 +1249,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1318,7 +1318,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1386,7 +1386,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1503,7 +1503,7 @@
push rbp ; save old base pointer value.
mov rbp, rsp ; set new base pointer value.
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx ; save callee-saved reg
push rsi
push rdi
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -26,7 +26,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -256,7 +256,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -456,7 +456,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -67,7 +67,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
- SAVE_XMM
+ SAVE_XMM 7
push rsi
push rdi
; end prolog
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -37,7 +37,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -157,7 +157,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -333,7 +333,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -428,7 +428,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -538,7 +538,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -651,7 +651,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -816,7 +816,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -908,7 +908,6 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- ;SAVE_XMM ;xmm6, xmm7 are not used here.
GET_GOT rbx
push rsi
push rdi
@@ -948,7 +947,6 @@
pop rdi
pop rsi
RESTORE_GOT
- ;RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -969,7 +967,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1238,7 +1236,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,6 +39,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -107,6 +108,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -162,6 +164,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -179,7 +182,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -286,6 +289,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -393,6 +397,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -413,6 +418,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -508,6 +514,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -580,6 +587,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -598,6 +606,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -670,6 +679,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -718,6 +728,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -874,7 +885,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1137,7 +1148,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -120,7 +120,7 @@
}
else
{
- vp8_build_inter_predictors_mb_s(xd);
+ vp8_build_inter16x16_predictors_mb_s(xd);
}
}
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -75,7 +75,6 @@
pbi->common.current_video_frame = 0;
pbi->ready_for_new_data = 1;
- pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
#if CONFIG_MULTITHREAD
pbi->max_threads = oxcf->max_threads;
vp8_decoder_create_threads(pbi);
@@ -255,7 +254,6 @@
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int retcode = 0;
- struct vpx_usec_timer timer;
/*if(pbi->ready_for_new_data == 0)
return -1;*/
@@ -320,8 +318,6 @@
pbi->common.error.setjmp = 1;
- vpx_usec_timer_start(&timer);
-
/*cm->current_video_frame++;*/
pbi->Source = source;
pbi->source_sz = size;
@@ -382,15 +378,9 @@
if(pbi->common.filter_level)
{
- struct vpx_usec_timer lpftimer;
- vpx_usec_timer_start(&lpftimer);
/* Apply the loop filter if appropriate. */
-
vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
- vpx_usec_timer_mark(&lpftimer);
- pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
-
cm->last_frame_type = cm->frame_type;
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;
@@ -402,11 +392,6 @@
vp8_clear_system_state();
-
- vpx_usec_timer_mark(&timer);
- pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
-
- pbi->time_decoding += pbi->decode_microseconds;
/*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -82,11 +82,6 @@
unsigned int source_sz;
unsigned char *segmentation_map;
- unsigned int CPUFreq;
- unsigned int decode_microseconds;
- unsigned int time_decoding;
- unsigned int time_loop_filtering;
-
#if CONFIG_MULTITHREAD
/* variable for threading */
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -122,7 +122,7 @@
}
else
{
- vp8_build_inter_predictors_mb_s(xd);
+ vp8_build_inter16x16_predictors_mb_s(xd);
}
return;
}
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -34,7 +34,7 @@
// 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
short *quant;
short *quant_fast;
- short *quant_shift;
+ unsigned char *quant_shift;
short *zbin;
short *zrun_zbin_boost;
short *round;
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -155,7 +155,7 @@
#define EXACT_QUANT
#ifdef EXACT_QUANT
static void vp8cx_invert_quant(int improved_quant, short *quant,
- short *shift, short d)
+ unsigned char *shift, short d)
{
if(improved_quant)
{
@@ -1519,7 +1519,7 @@
cpi->MVcount[1][mv_max+((xd->block[0].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++;
}
- if (!x->skip && !x->e_mbd.mode_info_context->mbmi.force_no_skip)
+ if (!x->skip)
{
vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
@@ -1529,7 +1529,7 @@
}
else
- vp8_stuff_inter16x16(x);
+ vp8_build_inter_predictors_mb_s(xd);
}
if (!x->skip)
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -196,40 +196,7 @@
}
-void vp8_stuff_inter16x16(MACROBLOCK *x)
-{
- vp8_build_inter_predictors_mb_s(&x->e_mbd);
- /*
- // recon = copy from predictors to destination
- {
- BLOCKD *b = &x->e_mbd.block[0];
- unsigned char *pred_ptr = b->predictor;
- unsigned char *dst_ptr = *(b->base_dst) + b->dst;
- int stride = b->dst_stride;
- int i;
- for(i=0;i<16;i++)
- vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);
-
- b = &x->e_mbd.block[16];
- pred_ptr = b->predictor;
- dst_ptr = *(b->base_dst) + b->dst;
- stride = b->dst_stride;
-
- for(i=0;i<8;i++)
- vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
-
- b = &x->e_mbd.block[20];
- pred_ptr = b->predictor;
- dst_ptr = *(b->base_dst) + b->dst;
- stride = b->dst_stride;
-
- for(i=0;i<8;i++)
- vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
- }
- */
-}
-
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
typedef struct vp8_token_state vp8_token_state;
@@ -635,7 +602,7 @@
/* this function is used by first pass only */
void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
- vp8_build_inter_predictors_mby(&x->e_mbd);
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -95,8 +95,6 @@
struct VP8_ENCODER_RTCD;
void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-extern void vp8_stuff_inter16x16(MACROBLOCK *x);
-
void vp8_build_dcblock(MACROBLOCK *b);
void vp8_transform_mb(MACROBLOCK *mb);
void vp8_transform_mbuv(MACROBLOCK *x);
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -194,13 +194,13 @@
#define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse;}}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
#define MIN(x,y) (((x)<(y))?(x):(y))
#define MAX(x,y) (((x)>(y))?(x):(y))
//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)
{
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
unsigned char *z = (*(b->base_src) + b->src);
@@ -226,7 +226,7 @@
bestmv->col <<= 3;
// calculate central point error
- besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+ besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
@@ -316,7 +316,7 @@
#undef CHECK_BETTER
#undef MIN
#undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)
{
int bestmse = INT_MAX;
MV startmv;
@@ -345,7 +345,7 @@
startmv = *bestmv;
// calculate central point error
- bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
*distortion = bestmse;
bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
@@ -360,6 +360,7 @@
*bestmv = this_mv;
bestmse = left;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.col += 8;
@@ -371,6 +372,7 @@
*bestmv = this_mv;
bestmse = right;
*distortion = thismse;
+ *sse1 = sse;
}
// go up then down and check error
@@ -384,6 +386,7 @@
*bestmv = this_mv;
bestmse = up;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.row += 8;
@@ -395,6 +398,7 @@
*bestmv = this_mv;
bestmse = down;
*distortion = thismse;
+ *sse1 = sse;
}
@@ -436,6 +440,7 @@
*bestmv = this_mv;
bestmse = diag;
*distortion = thismse;
+ *sse1 = sse;
}
// }
@@ -473,6 +478,7 @@
*bestmv = this_mv;
bestmse = left;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.col += 4;
@@ -484,6 +490,7 @@
*bestmv = this_mv;
bestmse = right;
*distortion = thismse;
+ *sse1 = sse;
}
// go up then down and check error
@@ -507,6 +514,7 @@
*bestmv = this_mv;
bestmse = up;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.row += 4;
@@ -518,6 +526,7 @@
*bestmv = this_mv;
bestmse = down;
*distortion = thismse;
+ *sse1 = sse;
}
@@ -608,12 +617,13 @@
*bestmv = this_mv;
bestmse = diag;
*distortion = thismse;
+ *sse1 = sse;
}
return bestmse;
}
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)
{
int bestmse = INT_MAX;
MV startmv;
@@ -640,7 +650,7 @@
startmv = *bestmv;
// calculate central point error
- bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
*distortion = bestmse;
bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
@@ -655,6 +665,7 @@
*bestmv = this_mv;
bestmse = left;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.col += 8;
@@ -666,6 +677,7 @@
*bestmv = this_mv;
bestmse = right;
*distortion = thismse;
+ *sse1 = sse;
}
// go up then down and check error
@@ -679,6 +691,7 @@
*bestmv = this_mv;
bestmse = up;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.row += 8;
@@ -690,6 +703,7 @@
*bestmv = this_mv;
bestmse = down;
*distortion = thismse;
+ *sse1 = sse;
}
// somewhat strangely not doing all the diagonals for half pel is slower than doing them.
@@ -741,6 +755,7 @@
*bestmv = this_mv;
bestmse = diag;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.col += 8;
@@ -752,6 +767,7 @@
*bestmv = this_mv;
bestmse = diag;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.col = (this_mv.col - 8) | 4;
@@ -764,6 +780,7 @@
*bestmv = this_mv;
bestmse = diag;
*distortion = thismse;
+ *sse1 = sse;
}
this_mv.col += 8;
@@ -775,6 +792,7 @@
*bestmv = this_mv;
bestmse = diag;
*distortion = thismse;
+ *sse1 = sse;
}
#endif
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -49,7 +49,7 @@
typedef int (fractional_mv_step_fp)
(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv,
- int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion);
+ int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse);
extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1765,9 +1765,6 @@
cm->horiz_scale = cpi->horiz_scale;
cm->vert_scale = cpi->vert_scale ;
- // As per VP8
- cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;
-
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
if (cpi->oxcf.Sharpness > 7)
cpi->oxcf.Sharpness = 7;
@@ -1797,10 +1794,6 @@
vp8_alloc_compressor_data(cpi);
}
- // Clamp KF frame size to quarter of data rate
- if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
- cpi->intra_frame_target = cpi->target_bandwidth >> 2;
-
if (cpi->oxcf.fixed_q >= 0)
{
cpi->last_q[0] = cpi->oxcf.fixed_q;
@@ -2025,7 +2018,6 @@
cpi->frames_till_gf_update_due = 0;
cpi->key_frame_count = 1;
- cpi->tot_key_frame_bits = 0;
cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
cpi->ni_tot_qi = 0;
@@ -2051,7 +2043,6 @@
for (i = 0; i < KEY_FRAME_CONTEXT; i++)
{
- cpi->prior_key_frame_size[i] = cpi->intra_frame_target;
cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
}
@@ -4456,9 +4447,9 @@
vp8_clear_system_state(); //__asm emms;
if (cpi->total_coded_error_left != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
- "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
- "%10.3f %8ld\n",
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+ "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%10.3f %8d\n",
cpi->common.current_video_frame, cpi->this_frame_target,
cpi->projected_frame_size,
(cpi->projected_frame_size - cpi->this_frame_target),
@@ -4475,9 +4466,9 @@
(double)cpi->bits_left / cpi->total_coded_error_left,
cpi->tot_recode_hits);
else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
- "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
- "%8ld\n",
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+ "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%8d\n",
cpi->common.current_video_frame,
cpi->this_frame_target, cpi->projected_frame_size,
(cpi->projected_frame_size - cpi->this_frame_target),
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -248,17 +248,17 @@
{
DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
@@ -389,14 +389,11 @@
int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
INT64 key_frame_count;
- INT64 tot_key_frame_bits;
- int prior_key_frame_size[KEY_FRAME_CONTEXT];
int prior_key_frame_distance[KEY_FRAME_CONTEXT];
int per_frame_bandwidth; // Current section per frame bandwidth target
int av_per_frame_bandwidth; // Average frame size target for clip
int min_frame_bandwidth; // Minimum allocation that should be used for any frame
int last_key_frame_size;
- int intra_frame_target;
int inter_frame_target;
double output_frame_rate;
long long last_time_stamp_seen;
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,7 +50,7 @@
extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
-int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse)
{
(void) b;
(void) d;
@@ -59,6 +59,7 @@
(void) vfp;
(void) mvcost;
(void) distortion;
+ (void) sse;
bestmv->row <<= 3;
bestmv->col <<= 3;
return 0;
@@ -443,7 +444,7 @@
int bestsme;
//int all_rds[MAX_MODES]; // Experimental debug code.
int best_mode_index = 0;
- int sse = INT_MAX;
+ unsigned int sse = INT_MAX;
MV mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -796,7 +797,7 @@
}
if (bestsme < INT_MAX)
- cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2);
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse);
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -827,7 +828,7 @@
x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int;
if((this_mode != NEWMV) || !(have_subp_search))
- distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));
+ distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse);
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -27,7 +27,7 @@
short *zbin_ptr = b->zbin;
short *round_ptr = b->round;
short *quant_ptr = b->quant_fast;
- short *quant_shift_ptr = b->quant_shift;
+ unsigned char *quant_shift_ptr = b->quant_shift;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
@@ -112,7 +112,7 @@
short *zbin_ptr = b->zbin;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
- short *quant_shift_ptr = b->quant_shift;
+ unsigned char *quant_shift_ptr = b->quant_shift;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
@@ -166,7 +166,7 @@
int sz;
short *coeff_ptr;
short *quant_ptr;
- short *quant_shift_ptr;
+ unsigned char *quant_shift_ptr;
short *qcoeff_ptr;
short *dqcoeff_ptr;
short *dequant_ptr;
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1497,86 +1497,83 @@
return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS;
}
-void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+
+static int estimate_keyframe_frequency(VP8_COMP *cpi)
{
int i;
- int av_key_frames_per_second;
- // Average key frame frequency and size
- unsigned int total_weight = 0;
- unsigned int av_key_frame_frequency = 0;
- unsigned int av_key_frame_bits = 0;
+ // Average key frame frequency
+ int av_key_frame_frequency = 0;
- unsigned int output_frame_rate = (unsigned int)(100 * cpi->output_frame_rate);
- unsigned int target_bandwidth = (unsigned int)(100 * cpi->target_bandwidth);
-
- // Clear down mmx registers to allow floating point in what follows
- vp8_clear_system_state(); //__asm emms;
-
- // Update the count of total key frame bits
- cpi->tot_key_frame_bits += cpi->projected_frame_size;
-
- // First key frame at start of sequence is a special case. We have no frequency data.
+ /* First key frame at start of sequence is a special case. We have no
+ * frequency data.
+ */
if (cpi->key_frame_count == 1)
{
- av_key_frame_frequency = (int)cpi->output_frame_rate * 2; // Assume a default of 1 kf every 2 seconds
- av_key_frame_bits = cpi->projected_frame_size;
- av_key_frames_per_second = output_frame_rate / av_key_frame_frequency; // Note output_frame_rate not cpi->output_frame_rate
+ /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+ * whichever is smaller.
+ */
+ av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+ if (av_key_frame_frequency > cpi->oxcf.key_freq)
+ av_key_frame_frequency = cpi->oxcf.key_freq;
+
+ cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+ = av_key_frame_frequency;
}
else
{
+ unsigned int total_weight = 0;
int last_kf_interval =
(cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
- // reset keyframe context and calculate weighted average of last KEY_FRAME_CONTEXT keyframes
+ /* reset keyframe context and calculate weighted average of last
+ * KEY_FRAME_CONTEXT keyframes
+ */
for (i = 0; i < KEY_FRAME_CONTEXT; i++)
{
if (i < KEY_FRAME_CONTEXT - 1)
- {
- cpi->prior_key_frame_size[i] = cpi->prior_key_frame_size[i+1];
- cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i+1];
- }
+ cpi->prior_key_frame_distance[i]
+ = cpi->prior_key_frame_distance[i+1];
else
- {
- cpi->prior_key_frame_size[i] = cpi->projected_frame_size;
cpi->prior_key_frame_distance[i] = last_kf_interval;
- }
- av_key_frame_bits += prior_key_frame_weight[i] * cpi->prior_key_frame_size[i];
- av_key_frame_frequency += prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i];
- total_weight += prior_key_frame_weight[i];
+ av_key_frame_frequency += prior_key_frame_weight[i]
+ * cpi->prior_key_frame_distance[i];
+ total_weight += prior_key_frame_weight[i];
}
- av_key_frame_bits /= total_weight;
av_key_frame_frequency /= total_weight;
- av_key_frames_per_second = output_frame_rate / av_key_frame_frequency;
}
+ return av_key_frame_frequency;
+}
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+ // Clear down mmx registers to allow floating point in what follows
+ vp8_clear_system_state();
+
// Do we have any key frame overspend to recover?
- if ((cpi->pass != 2) && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
+ // Two-pass overspend handled elsewhere.
+ if ((cpi->pass != 2)
+ && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
{
- // Update the count of key frame overspend to be recovered in subsequent frames
- // A portion of the KF overspend is treated as gf overspend (and hence recovered more quickly)
- // as the kf is also a gf. Otherwise the few frames following each kf tend to get more bits
- // allocated than those following other gfs.
- cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8;
- cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8;
- if(!av_key_frame_frequency)
- av_key_frame_frequency = 60;
+ int overspend;
- // Work out how much to try and recover per frame.
- // For one pass we estimate the number of frames to spread it over based upon past history.
- // For two pass we know how many frames there will be till the next kf.
- if (cpi->pass == 2)
- {
- if (cpi->frames_to_key > 16)
- cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)cpi->frames_to_key;
- else
- cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / 16;
- }
- else
- cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)av_key_frame_frequency;
+ /* Update the count of key frame overspend to be recovered in
+ * subsequent frames. A portion of the KF overspend is treated as gf
+ * overspend (and hence recovered more quickly) as the kf is also a
+ * gf. Otherwise the few frames following each kf tend to get more
+ * bits allocated than those following other gfs.
+ */
+ overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth);
+ cpi->kf_overspend_bits += overspend * 7 / 8;
+ cpi->gf_overspend_bits += overspend * 1 / 8;
+
+ /* Work out how much to try and recover per frame. */
+ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits
+ / estimate_keyframe_frequency(cpi);
}
cpi->frames_since_key = 0;
@@ -1583,6 +1580,7 @@
cpi->last_key_frame_size = cpi->projected_frame_size;
cpi->key_frame_count++;
}
+
void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
{
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1271,13 +1271,14 @@
if (bestsme < INT_MAX)
{
int distortion;
+ unsigned int sse;
if (!cpi->common.full_pixel)
cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
- bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion);
+ bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion, &sse);
else
vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
- bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion);
+ bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion, &sse);
}
} /* NEW4X4 */
@@ -1817,9 +1818,6 @@
//int intermodecost[MAX_MODES];
MB_PREDICTION_MODE uv_intra_mode;
-
- int force_no_skip = 0;
-
MV mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int saddone=0;
@@ -1922,8 +1920,6 @@
int disable_skip = 0;
int other_cost = 0;
- force_no_skip = 0;
-
// Experimental debug code.
// Record of rd values recorded for this MB. -1 indicates not measured
//all_rds[mode_index] = -1;
@@ -2255,9 +2251,10 @@
x->mv_row_max = tmp_row_max;
if (bestsme < INT_MAX)
- {
- int dis; /* TODO: use dis in distortion calculation later. */
- cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis);
+ {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse);
}
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
@@ -2297,7 +2294,7 @@
continue;
vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
- vp8_build_inter_predictors_mby(&x->e_mbd);
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd);
if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
x->skip = 1;
@@ -2304,7 +2301,8 @@
}
else if (x->encode_breakout)
{
- int sum, sse;
+ int sum;
+ unsigned int sse;
int threshold = (xd->block[0].dequant[1]
* xd->block[0].dequant[1] >>4);
@@ -2313,7 +2311,7 @@
VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)
(x->src.y_buffer, x->src.y_stride,
- x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);
+ x->e_mbd.predictor, 16, &sse, &sum);
if (sse < threshold)
{
@@ -2337,8 +2335,7 @@
distortion_uv = sse2;
disable_skip = 1;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2,
- distortion2);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
break;
}
@@ -2433,7 +2430,6 @@
{
// Note index of best mode so far
best_mode_index = mode_index;
- x->e_mbd.mode_info_context->mbmi.force_no_skip = force_no_skip;
if (this_mode <= B_PRED)
{
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -290,8 +290,8 @@
}
}
-const static long long c1 = 426148; // (256^2*(.01*255)^2
-const static long long c2 = 3835331; //(256^2*(.03*255)^2
+const static long long cc1 = 26634; // (64^2*(.01*255)^2)
+const static long long cc2 = 239708; // (64^2*(.03*255)^2)
static double similarity
(
@@ -303,11 +303,20 @@
int count
)
{
- long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2);
+ long long ssim_n, ssim_d;
+ long long c1, c2;
- long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
- (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ;
+ //scale the constants by number of pixels
+ c1 = (cc1*count*count)>>12;
+ c2 = (cc2*count*count)>>12;
+ ssim_n = (2*sum_s*sum_r+ c1)*((long long) 2*count*sum_sxr-
+ (long long) 2*sum_s*sum_r+c2);
+
+ ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
+ ((long long)count*sum_sq_s-(long long)sum_s*sum_s +
+ (long long)count*sum_sq_r-(long long) sum_r*sum_r +c2) ;
+
return ssim_n * 1.0 / ssim_d;
}
@@ -332,18 +341,33 @@
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
- double ssim3;
- long long ssim_n;
- long long ssim_d;
+ long long ssim3;
+ long long ssim_n,ssim_n1,ssim_n2;
+ long long ssim_d,ssim_d1,ssim_d2;
+ long long ssim_t1,ssim_t2;
+ long long c1, c2;
+ // normalize by 256/64
+ c1 = cc1*16;
+ c2 = cc2*16;
+
rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
- ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2);
+ ssim_n1 = (2*sum_s*sum_r+ c1);
- ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
- (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ;
+ ssim_n2 =((long long) 2*256*sum_sxr-(long long) 2*sum_s*sum_r+c2);
- ssim3 = 256 * (ssim_d-ssim_n) / ssim_d;
- return (long)( 256*ssim3 * ssim3 );
+ ssim_d1 =((long long)sum_s*sum_s +(long long)sum_r*sum_r+c1);
+
+ ssim_d2 = (256 * (long long) sum_sq_s-(long long) sum_s*sum_s +
+ (long long) 256*sum_sq_r-(long long) sum_r*sum_r +c2) ;
+
+ ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;
+ ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;
+
+ ssim3 = 256 *ssim_t1 * ssim_t2;
+ if(ssim3 <0 )
+ ssim3=0;
+ return (long)( ssim3 );
}
// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels
// such that the window regions overlap block boundaries to penalize blocking
@@ -361,18 +385,20 @@
)
{
int i,j;
-
+ int samples =0;
double ssim_total=0;
- // we can sample points as frequently as we like start with 1 per 8x8
- for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)
+ // we can sample points as frequently as we like; start with 1 per 4x4
+ for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)
{
- for(j=0; j < width; j+=8 )
+ for(j=0; j < width-8; j+=4 )
{
- ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd);
+ double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2, rtcd);
+ ssim_total += v;
+ samples++;
}
}
- ssim_total /= (width/8 * height /8);
+ ssim_total /= samples;
return ssim_total;
}
@@ -405,4 +431,4 @@
*weight = 1;
return ssimv;
-}
+}
\ No newline at end of file
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -209,10 +209,11 @@
//if (bestsme > error_thresh && bestsme < INT_MAX)
{
int distortion;
+ unsigned int sse;
bestsme = cpi->find_fractional_mv_step(x, b, d,
&d->bmi.mv.as_mv, &best_ref_mv1,
x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
- mvcost, &distortion);
+ mvcost, &distortion, &sse);
}
#endif
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -229,16 +229,6 @@
x->mode_info_context->mbmi.dc_diff = 1;
-#if 0
-
- if (x->mbmi.force_no_skip)
- {
- x->mbmi.mb_skip_coeff = 1;
- //reset for next_mb.
- x->mbmi.force_no_skip = 0;
- }
-
-#endif
#if 1
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -33,6 +33,7 @@
%define input rcx
%define output rdx
%define pitch r8
+ SAVE_XMM 7, u
%else
%define input rdi
%define output rsi
@@ -53,6 +54,7 @@
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
%endif
%endif
ret
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -22,33 +22,33 @@
; end prologue
mov rsi, arg(0) ;coeff_ptr
-
mov rdi, arg(1) ;dcoef_ptr
- movdqa xmm3, [rsi]
- movdqa xmm4, [rdi]
- movdqa xmm5, [rsi+16]
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
- movdqa xmm6, [rdi+16]
- psubw xmm3, xmm4
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
- psubw xmm5, xmm6
- pmaddwd xmm3, xmm3
- pmaddwd xmm5, xmm5
+ psubw xmm0, xmm1
+ psubw xmm2, xmm3
- paddd xmm3, xmm5
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm2, xmm2
- pxor xmm7, xmm7
- movdqa xmm0, xmm3
+ paddd xmm0, xmm2
- punpckldq xmm0, xmm7
- punpckhdq xmm3, xmm7
+ pxor xmm5, xmm5
+ movdqa xmm1, xmm0
- paddd xmm0, xmm3
- movdqa xmm3, xmm0
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
psrldq xmm0, 8
- paddd xmm0, xmm3
+ paddd xmm0, xmm1
movq rax, xmm0
@@ -208,6 +208,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 6
push rsi
push rdi
; end prolog
@@ -214,47 +215,47 @@
mov rsi, arg(0) ;coeff_ptr
- pxor xmm7, xmm7
+ pxor xmm6, xmm6
mov rdi, arg(1) ;dcoef_ptr
- pxor xmm2, xmm2
+ pxor xmm4, xmm4
- movd xmm1, dword ptr arg(2) ;dc
- por xmm1, xmm2
+ movd xmm5, dword ptr arg(2) ;dc
+ por xmm5, xmm4
- pcmpeqw xmm1, xmm7
+ pcmpeqw xmm5, xmm6
mov rcx, 16
mberror_loop:
- movdqa xmm3, [rsi]
- movdqa xmm4, [rdi]
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
- movdqa xmm5, [rsi+16]
- movdqa xmm6, [rdi+16]
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
- psubw xmm5, xmm6
- pmaddwd xmm5, xmm5
+ psubw xmm2, xmm3
+ pmaddwd xmm2, xmm2
- psubw xmm3, xmm4
- pand xmm3, xmm1
+ psubw xmm0, xmm1
+ pand xmm0, xmm5
- pmaddwd xmm3, xmm3
+ pmaddwd xmm0, xmm0
add rsi, 32
add rdi, 32
sub rcx, 1
- paddd xmm2, xmm5
+ paddd xmm4, xmm2
- paddd xmm2, xmm3
+ paddd xmm4, xmm0
jnz mberror_loop
- movdqa xmm0, xmm2
- punpckldq xmm0, xmm7
+ movdqa xmm0, xmm4
+ punpckldq xmm0, xmm6
- punpckhdq xmm2, xmm7
- paddd xmm0, xmm2
+ punpckhdq xmm4, xmm6
+ paddd xmm0, xmm4
movdqa xmm1, xmm0
psrldq xmm0, 8
@@ -265,6 +266,7 @@
pop rdi
pop rsi
; begin epilog
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -342,7 +344,7 @@
mov rdi, arg(1) ;d_ptr
mov rcx, 16
- pxor xmm7, xmm7
+ pxor xmm3, xmm3
mbuverror_loop:
@@ -352,7 +354,7 @@
psubw xmm1, xmm2
pmaddwd xmm1, xmm1
- paddd xmm7, xmm1
+ paddd xmm3, xmm1
add rsi, 16
add rdi, 16
@@ -361,7 +363,7 @@
jnz mbuverror_loop
pxor xmm0, xmm0
- movdqa xmm1, xmm7
+ movdqa xmm1, xmm3
movdqa xmm2, xmm1
punpckldq xmm1, xmm0
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -17,7 +17,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -20,7 +20,7 @@
sym(vp8_regular_quantize_b_sse2):
push rbp
mov rbp, rsp
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
%if ABI_IS_32BIT
@@ -142,7 +142,7 @@
movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
; downshift by quant_shift[rc]
- movsx ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc]
+ movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
sar edi, cl ; also sets Z bit
je rq_zigzag_loop_%1 ; !y
mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,6 +21,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 6
push rsi
push rdi
; end prolog
@@ -34,7 +35,7 @@
lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8]
- pxor xmm7, xmm7
+ pxor xmm6, xmm6
x16x16sad_wmt_loop:
@@ -52,32 +53,33 @@
punpcklbw xmm1, xmm3
psadbw xmm0, xmm1
- movq xmm6, QWORD PTR [rsi+rax+8]
+ movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- punpcklbw xmm4, xmm6
+ punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3
psadbw xmm4, xmm5
- paddw xmm7, xmm0
- paddw xmm7, xmm4
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
cmp rsi, rcx
jne x16x16sad_wmt_loop
- movq xmm0, xmm7
- psrldq xmm7, 8
+ movq xmm0, xmm6
+ psrldq xmm6, 8
- paddw xmm0, xmm7
+ paddw xmm0, xmm6
movq rax, xmm0
; begin epilog
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -33,6 +33,7 @@
movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
@@ -39,8 +40,8 @@
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
- %define result_ptr [rsp+8+4*8]
- %define max_err [rsp+8+4*8]
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_err [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
@@ -72,6 +73,7 @@
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
%endif
%endif
ret
@@ -106,6 +108,7 @@
xchg rbx, rax
%else
%ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define r0_ptr rsi
@@ -113,7 +116,7 @@
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
- %define result_ptr [rsp+16+4*8]
+ %define result_ptr [rsp+xmm_stack_space+16+4*8]
push rsi
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
@@ -151,6 +154,7 @@
%else
%ifidn __OUTPUT_FORMAT__,x64
pop rsi
+ RESTORE_XMM
%endif
%endif
ret
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -157,6 +157,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
push rsi
push rdi
push rcx
@@ -253,6 +254,7 @@
pop rcx
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -268,6 +270,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
push rsi
push rdi
push rcx
@@ -361,6 +364,7 @@
pop rcx
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -16,12 +16,12 @@
paddusw xmm14, xmm4 ; sum_r
movdqa xmm1, xmm3
pmaddwd xmm1, xmm1
- paddq xmm13, xmm1 ; sum_sq_s
+ paddd xmm13, xmm1 ; sum_sq_s
movdqa xmm2, xmm4
pmaddwd xmm2, xmm2
- paddq xmm12, xmm2 ; sum_sq_r
+ paddd xmm12, xmm2 ; sum_sq_r
pmaddwd xmm3, xmm4
- paddq xmm11, xmm3 ; sum_sxr
+ paddd xmm11, xmm3 ; sum_sxr
%endmacro
; Sum across the register %1 starting with q words
@@ -66,6 +66,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
push rsi
push rdi
; end prolog
@@ -115,19 +116,20 @@
SUM_ACROSS_Q xmm11
mov rdi,arg(4)
- movq [rdi], xmm15;
+ movd [rdi], xmm15;
mov rdi,arg(5)
- movq [rdi], xmm14;
+ movd [rdi], xmm14;
mov rdi,arg(6)
- movq [rdi], xmm13;
+ movd [rdi], xmm13;
mov rdi,arg(7)
- movq [rdi], xmm12;
+ movd [rdi], xmm12;
mov rdi,arg(8)
- movq [rdi], xmm11;
+ movd [rdi], xmm11;
; begin epilog
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -154,6 +156,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
push rsi
push rdi
; end prolog
@@ -174,11 +177,8 @@
NextRow2:
;grab source and reference pixels
- movq xmm5, [rsi]
- movq xmm6, [rdi]
-
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
punpcklbw xmm3, xmm0 ; low_s
punpcklbw xmm4, xmm0 ; low_r
@@ -197,19 +197,20 @@
SUM_ACROSS_Q xmm11
mov rdi,arg(4)
- movq [rdi], xmm15;
+ movd [rdi], xmm15;
mov rdi,arg(5)
- movq [rdi], xmm14;
+ movd [rdi], xmm14;
mov rdi,arg(6)
- movq [rdi], xmm13;
+ movd [rdi], xmm13;
mov rdi,arg(7)
- movq [rdi], xmm12;
+ movd [rdi], xmm12;
mov rdi,arg(8)
- movq [rdi], xmm11;
+ movd [rdi], xmm11;
; begin epilog
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -77,7 +77,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -26,7 +26,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -164,10 +164,10 @@
movdqa xmm6, [rdi+32]
movdqa xmm7, [rdi+48]
; += modifier
- paddw xmm4, xmm0
- paddw xmm5, xmm2
- paddw xmm6, xmm1
- paddw xmm7, xmm3
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
; write back
movdqa [rdi], xmm4
movdqa [rdi+16], xmm5
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,6 +85,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
push rbx
push rsi
push rdi
@@ -206,6 +207,7 @@
pop rdi
pop rsi
pop rbx
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -223,6 +225,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -321,6 +324,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -341,6 +345,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -506,6 +511,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -528,7 +534,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -805,6 +811,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -906,6 +913,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -925,7 +933,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1041,6 +1049,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1127,6 +1136,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -1146,7 +1156,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1254,6 +1264,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1338,6 +1349,7 @@
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -1357,7 +1369,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -34,7 +34,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -151,8 +151,8 @@
__asm__ __volatile__ ("pause \n\t")
#else
#if ARCH_X86_64
-/* No pause intrinsic for windows x64 */
-#define x86_pause_hint()
+#define x86_pause_hint()\
+ _mm_pause();
#else
#define x86_pause_hint()\
__asm pause
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -255,21 +255,48 @@
%define UNSHADOW_ARGS mov rsp, rbp
%endif
-; must keep XMM6:XMM15 (libvpx uses XMM6 and XMM7) on Win64 ABI
-; rsp register has to be aligned
+; Win64 ABI requires that XMM6:XMM15 are callee saved
+; SAVE_XMM n, [u]
+; store registers 6-n on the stack
+; if u is specified, use unaligned movs.
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return
+; value. Typically we follow this up with 'push rbp' - re-aligning the stack -
+; but in some cases this is not done and unaligned movs must be used.
%ifidn __OUTPUT_FORMAT__,x64
-%macro SAVE_XMM 0
- sub rsp, 32
- movdqa XMMWORD PTR [rsp], xmm6
- movdqa XMMWORD PTR [rsp+16], xmm7
+%macro SAVE_XMM 1-2 a
+ %if %1 < 6
+ %error Only xmm registers 6-15 must be preserved
+ %else
+ %assign last_xmm %1
+ %define movxmm movdq %+ %2
+ %assign xmm_stack_space ((last_xmm - 5) * 16)
+ sub rsp, xmm_stack_space
+ %assign i 6
+ %rep (last_xmm - 5)
+ movxmm [rsp + ((i - 6) * 16)], xmm %+ i
+ %assign i i+1
+ %endrep
+ %endif
%endmacro
%macro RESTORE_XMM 0
- movdqa xmm6, XMMWORD PTR [rsp]
- movdqa xmm7, XMMWORD PTR [rsp+16]
- add rsp, 32
+ %ifndef last_xmm
+ %error RESTORE_XMM must be paired with SAVE_XMM n
+ %else
+ %assign i last_xmm
+ %rep (last_xmm - 5)
+ movxmm xmm %+ i, [rsp +((i - 6) * 16)]
+ %assign i i-1
+ %endrep
+ add rsp, xmm_stack_space
+ ; there are a couple functions which return from multiple places.
+ ; otherwise, we could uncomment these:
+ ; %undef last_xmm
+ ; %undef xmm_stack_space
+ ; %undef movxmm
+ %endif
%endmacro
%else
-%macro SAVE_XMM 0
+%macro SAVE_XMM 1-2
%endmacro
%macro RESTORE_XMM 0
%endmacro
--