ref: 712762b5081068e3e20cca8d01832f316ae74656
parent: 87e570e6bea5564c36f9464e099bf72b7cebbd5d
parent: 56417a3075c5baa6dc8ad6f708547190458fbaa5
author: John Koleszar <jkoleszar@google.com>
date: Thu Aug 4 20:05:04 EDT 2011
Merge remote branch 'origin/master' into experimental Change-Id: Ic698ea5f5b31a5faf467eb0da4b762f9586df938
--- a/.mailmap
+++ b/.mailmap
@@ -2,3 +2,4 @@
Johann Koenig <johannkoenig@google.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Tom Finegan <tomfinegan@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
--- a/AUTHORS
+++ b/AUTHORS
@@ -4,8 +4,11 @@
Aaron Watry <awatry@gmail.com>
Adrian Grange <agrange@google.com>
Alex Converse <alex.converse@gmail.com>
+Alexis Ballier <aballier@gentoo.org>
+Alok Ahuja <waveletcoeff@gmail.com>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
+Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
@@ -22,12 +25,15 @@
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
+Joshua Bleecher Snyder <josh@treelinelabs.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
+Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
Martin Ettl <ettl.martin78@googlemail.com>
Michael Kohler <michaelkohler@live.com>
+Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
@@ -34,8 +40,14 @@
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Philip Jägenstedt <philipj@opera.com>
+Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Ralph Giles <giles@xiph.org>
+Ronald S. Bultje <rbultje@google.com>
Scott LaVarnway <slavarnway@google.com>
+Stefan Holmer <holmer@google.com>
+Taekhyun Kim <takim@nvidia.com>
Tero Rintaluoma <teror@google.com>
+Thijs Vermeir <thijsvermeir@gmail.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Yaowu Xu <yaowu@google.com>
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,85 @@
+2011-08-02 v0.9.7 "Cayuga"
+ Our third named release, focused on a faster, higher quality, encoder.
+
+ - Upgrading:
+ This release is backwards compatible with Aylesbury (v0.9.5) and
+ Bali (v0.9.6). Users of older releases should refer to the Upgrading
+ notes in this document for that release.
+
+ - Enhancements:
+ Stereo 3D format support for vpxenc
+ Runtime detection of available processor cores.
+ Allow specifying --end-usage by enum name
+ vpxdec: test for frame corruption
+ vpxenc: add quantizer histogram display
+ vpxenc: add rate histogram display
+ Set VPX_FRAME_IS_DROPPABLE
+ update configure for ios sdk 4.3
+ Avoid text relocations in ARM vp8 decoder
+ Generate a vpx.pc file for pkg-config.
+ New ways of passing encoded data between encoder and decoder.
+
+ - Speed:
+ This release includes across-the-board speed improvements to the
+ encoder. On x86, these measure at approximately 11.5% in Best mode,
+ 21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
+ On ARM Cortex A9 with Neon extensions, real-time encoding of video
+ telephony content is 35% faster than Bali on single core and 48%
+ faster on multi-core. On the NVidia Tegra2 platform, real time
+ encoding is 40% faster than Bali.
+
+ Decoder speed was not a priority for this release, but improved
+ approximately 8.4% on x86.
+
+ Reduce motion vector search on alt-ref frame.
+ Encoder loopfilter running in its own thread
+ Reworked loopfilter to precalculate more parameters
+ SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
+ Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
+ Removed redundant checks
+ Reduced structure sizes
+ utilize preload in ARMv6 MC/LPF/Copy routines
+ ARM optimized quantization, dfct, variance, subtract
+    Increase chroma row alignment to 16 bytes.
+ disable trellis optimization for first pass
+ Write SSSE3 sub-pixel filter function
+    Improve SSE2 half-pixel filter functions
+ Add vp8_sub_pixel_variance16x8_ssse3 function
+ Reduce unnecessary distortion computation
+ Use diamond search to replace full search
+ Preload reference area in sub-pixel motion search (real-time mode)
+
+ - Quality:
+ This release focused primarily on one-pass use cases, including
+ video conferencing. Low latency data rate control was significantly
+ improved, improving streamability over bandwidth constrained links.
+ Added support for error concealment, allowing frames to maintain
+ visual quality in the presence of substantial packet loss.
+
+ Add rc_max_intra_bitrate_pct control
+ Limit size of initial keyframe in one-pass.
+ Improve framerate adaptation
+ Improved 1-pass CBR rate control
+ Improved KF insertion after fades to still.
+ Improved key frame detection.
+ Improved activity masking (lower PSNR impact for same SSIM boost)
+ Improved interaction between GF and ARFs
+ Adding error-concealment to the decoder.
+ Adding support for independent partitions
+ Adjusted rate-distortion constants
+
+
+ - Bug Fixes:
+ Removed firstpass motion map
+ Fix parallel make install
+ Fix multithreaded encoding for 1 MB wide frame
+ Fixed iwalsh_neon build problems with RVDS4.1
+ Fix semaphore emulation, spin-wait intrinsics on Windows
+ Fix build with xcode4 and simplify GLOBAL.
+ Mark ARM asm objects as allowing a non-executable stack.
+ Fix vpxenc encoding incorrect webm file header on big endian
+
+
2011-03-07 v0.9.6 "Bali"
Our second named release, focused on a faster, higher quality, encoder.
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -336,6 +336,7 @@
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_proj.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh
DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/yasm.rules
+ DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/obj_int_extract.bat
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
# Include obj_int_extract if we use offsets from asm_*_offsets
DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -952,6 +952,10 @@
# shared objects
enabled gcc && enabled pic && check_add_cflags -fPIC
+ # Work around longjmp interception on glibc >= 2.11, to improve binary
+ # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
+ enabled linux && check_add_cflags -D_FORTIFY_SOURCE=0
+
# Check for strip utility variant
${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
--- a/libs.mk
+++ b/libs.mk
@@ -35,6 +35,7 @@
CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
+ CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
@@ -47,6 +48,7 @@
CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
+ CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8dx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
@@ -89,6 +91,7 @@
CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
@@ -100,7 +103,7 @@
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
endif
CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
-CODEC_SRCS-$(ARCH_ARM) += $(BUILD_PFX)vpx_config.asm
+CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@@ -177,7 +180,7 @@
else
LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS)
-LIBS-$(CONFIG_STATIC) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
+LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
@@ -269,20 +272,20 @@
#
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
- asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.S
+ $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
- $(VP8_PREFIX)common/asm_com_offsets.c.S: vp8/common/asm_com_offsets.c
- CLEAN-OBJS += asm_com_offsets.asm $(VP8_PREFIX)common/asm_com_offsets.c.S
+ $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
+ CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
- asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+ $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
- $(VP8_PREFIX)encoder/asm_enc_offsets.c.S: vp8/encoder/asm_enc_offsets.c
- CLEAN-OBJS += asm_enc_offsets.asm $(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+ $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
+ CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
- asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+ $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
- $(VP8_PREFIX)decoder/asm_dec_offsets.c.S: vp8/decoder/asm_dec_offsets.c
- CLEAN-OBJS += asm_dec_offsets.asm $(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+ $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
+ CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
else
ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
asm_com_offsets.asm: obj_int_extract
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -9,6 +9,8 @@
*/
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
#include "vpx_ports/asm_offsets.h"
#include "vpx_scale/yv12config.h"
@@ -25,8 +27,14 @@
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_ARMV7
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -19,7 +19,9 @@
#include "entropy.h"
#include "idct.h"
#include "recon.h"
+#if CONFIG_POSTPROC
#include "postproc.h"
+#endif
/*#ifdef PACKET_TESTING*/
#include "header.h"
@@ -75,7 +77,9 @@
vp8_recon_rtcd_vtable_t recon;
vp8_subpix_rtcd_vtable_t subpix;
vp8_loopfilter_rtcd_vtable_t loopfilter;
+#if CONFIG_POSTPROC
vp8_postproc_rtcd_vtable_t postproc;
+#endif
int flags;
#else
int unused;
@@ -202,7 +206,9 @@
#if CONFIG_MULTITHREAD
int processor_core_count;
#endif
+#if CONFIG_POSTPROC
struct postproc_state postproc_state;
+#endif
} VP8_COMMON;
#endif
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -13,7 +13,6 @@
#include "vpx_ports/arm.h"
#include "vp8/common/blockd.h"
#include "vp8/common/pragmas.h"
-#include "vp8/common/postproc.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/decoder/onyxd_int.h"
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -9,6 +9,7 @@
*/
+#include "vpx_config.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
#include "vp8/common/systemdependent.h"
@@ -24,7 +25,9 @@
#include "segmentation.h"
#include "vp8/common/g_common.h"
#include "vpx_scale/yv12extend.h"
+#if CONFIG_POSTPROC
#include "vp8/common/postproc.h"
+#endif
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/threading.h"
@@ -2660,6 +2663,8 @@
cpi->Source = &cpi->scaled_source;
#endif
}
+ else
+ cpi->Source = sd;
}
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -24,7 +24,6 @@
#include "segmentation.h"
#include "vp8/common/g_common.h"
#include "vpx_scale/yv12extend.h"
-#include "vp8/common/postproc.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/threading.h"
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -15,6 +15,7 @@
VP8_COMMON_SRCS-yes += common/onyx.h
VP8_COMMON_SRCS-yes += common/onyxd.h
VP8_COMMON_SRCS-yes += common/alloccommon.c
+VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
VP8_COMMON_SRCS-yes += common/debugmodes.c
@@ -101,14 +102,16 @@
endif
# common (c)
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/asm_com_offsets.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
-
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h
# common (armv6)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,9 +15,12 @@
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.h
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/encodemb_arm.h
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.h
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.h
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -48,6 +48,7 @@
#INCLUDES += common
#INCLUDES += decoder
+VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
VP8_DX_SRCS-yes += decoder/dboolhuff.c
VP8_DX_SRCS-yes += decoder/decodemv.c
VP8_DX_SRCS-yes += decoder/decodframe.c
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -12,9 +12,8 @@
#VP8_DX_SRCS list is modified according to different platforms.
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c
-VP8_CX_SRCS-$(ARCH_ARM) += decoder/asm_dec_offsets.c
-
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
+VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h
#File list for armv6
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
--- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
@@ -18,42 +18,32 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
-;Note: this is VP8 function, which has border=32 and 16. Internal y_width and y_height
-; are always multiples of 16.
+; we depend on VP8BORDERINPIXELS being 32
|vp8_yv12_extend_frame_borders_neon| PROC
push {r4 - r10, lr}
vpush {d8 - d15}
- ;Not need to load y_width, since: y_width = y_stride - 2*border
- ldr r3, [r0, #yv12_buffer_config_border]
- ldr r1, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r4, [r0, #yv12_buffer_config_y_height]
- ldr lr, [r0, #yv12_buffer_config_y_stride]
+ ; Border = 32
+ ldr r3, [r0, #yv12_buffer_config_y_width] ; plane_width
+ ldr r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1
+ ldr r4, [r0, #yv12_buffer_config_y_height] ; plane_height
+ ldr lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride
- cmp r3, #16
- beq b16_extend_frame_borders
+; Border copy for Y plane
+; copy the left and right most columns out
+ add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
+ sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1
+ sub r5, r1, #32 ; dest_ptr1 = src_ptr1 - Border
-;=======================
-b32_extend_frame_borders
-;border = 32
-;=======================
-;Border copy for Y plane
-;copy the left and right most columns out
- sub r5, r1, r3 ;destptr1
- add r6, r1, lr
- sub r6, r6, r3, lsl #1 ;destptr2
- sub r2, r6, #1 ;srcptr2
+ mov r12, r4, lsr #2 ; plane_height / 4
- ;Do four rows at one time
- mov r12, r4, lsr #2
-
copy_left_right_y
vld1.8 {d0[], d1[]}, [r1], lr
vld1.8 {d4[], d5[]}, [r2], lr
vld1.8 {d8[], d9[]}, [r1], lr
vld1.8 {d12[], d13[]}, [r2], lr
- vld1.8 {d16[], d17[]}, [r1], lr
+ vld1.8 {d16[], d17[]}, [r1], lr
vld1.8 {d20[], d21[]}, [r2], lr
vld1.8 {d24[], d25[]}, [r1], lr
vld1.8 {d28[], d29[]}, [r2], lr
@@ -81,15 +71,16 @@
bne copy_left_right_y
;Now copy the top and bottom source lines into each line of the respective borders
- ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- mul r8, r3, lr
+ ldr r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer
+ mul r8, r4, lr ; plane_height * plane_stride
- mov r12, lr, lsr #7
+ ; copy width is plane_stride
+ mov r12, lr, lsr #7 ; plane_stride / 128
- sub r6, r1, r3 ;destptr2
- sub r2, r6, lr ;srcptr2
- sub r1, r7, r3 ;srcptr1
- sub r5, r1, r8 ;destptr1
+ sub r1, r1, #32 ; src_ptr1 = y_buffer - Border
+ add r6, r1, r8 ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride))
+ sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
+ sub r5, r1, lr, asl #5 ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
copy_top_bottom_y
vld1.8 {q0, q1}, [r1]!
@@ -101,7 +92,7 @@
vld1.8 {q6, q7}, [r1]!
vld1.8 {q14, q15}, [r2]!
- mov r7, r3
+ mov r7, #32 ; Border
top_bottom_32
subs r7, r7, #1
@@ -115,45 +106,42 @@
vst1.8 {q6, q7}, [r5]!
vst1.8 {q14, q15}, [r6]!
- add r5, r5, lr
- sub r5, r5, #128
- add r6, r6, lr
- sub r6, r6, #128
+ add r5, r5, lr ; dest_ptr1 += plane_stride
+ sub r5, r5, #128 ; dest_ptr1 -= 128
+ add r6, r6, lr ; dest_ptr2 += plane_stride
+ sub r6, r6, #128 ; dest_ptr2 -= 128
bne top_bottom_32
- sub r5, r1, r8
- add r6, r2, lr
+ sub r5, r1, lr, asl #5 ; src_ptr1 - (Border* plane_stride)
+ add r6, r2, lr ; src_ptr2 + plane_stride
subs r12, r12, #1
bne copy_top_bottom_y
- mov r7, lr, lsr #4 ;check to see if extra copy is needed
+ mov r7, lr, lsr #4 ; check to see if extra copy is needed
ands r7, r7, #0x7
bne extra_top_bottom_y
end_of_border_copy_y
;Border copy for U, V planes
- ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
- mov lr, lr, lsr #1 ;uv_stride
- mov r3, r3, lsr #1 ;border
- mov r4, r4, lsr #1 ;uv_height
- mov r8, r8, lsr #2
+; Border = 16
+ ldr r7, [r0, #yv12_buffer_config_u_buffer] ; src_ptr1
+ ldr lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
+ ldr r3, [r0, #yv12_buffer_config_uv_width] ; plane_width
+ ldr r4, [r0, #yv12_buffer_config_uv_height] ; plane_height
mov r10, #2
;copy the left and right most columns out
border_copy_uv
- sub r5, r1, r3 ;destptr1
- add r6, r1, lr
- sub r6, r6, r3, lsl #1 ;destptr2
- sub r2, r6, #1 ;srcptr2
+ mov r1, r7 ; src_ptr1 needs to be saved for second half of loop
+ sub r5, r1, #16 ; dest_ptr1 = src_ptr1 - Border
+ add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
+ sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1
- mov r7, r1
+ mov r12, r4, lsr #3 ; plane_height / 8
- ;Do eight rows at one time
- mov r12, r4, lsr #3
-
copy_left_right_uv
vld1.8 {d0[], d1[]}, [r1], lr
vld1.8 {d2[], d3[]}, [r2], lr
@@ -167,7 +155,7 @@
vld1.8 {d18[], d19[]}, [r2], lr
vld1.8 {d20[], d21[]}, [r1], lr
vld1.8 {d22[], d23[]}, [r2], lr
- vld1.8 {d24[], d25[]}, [r1], lr
+ vld1.8 {d24[], d25[]}, [r1], lr
vld1.8 {d26[], d27[]}, [r2], lr
vld1.8 {d28[], d29[]}, [r1], lr
vld1.8 {d30[], d31[]}, [r2], lr
@@ -194,12 +182,14 @@
bne copy_left_right_uv
;Now copy the top and bottom source lines into each line of the respective borders
- mov r12, lr, lsr #6
+ mov r1, r7
+ mul r8, r4, lr ; plane_height * plane_stride
+ mov r12, lr, lsr #6 ; plane_stride / 64
- sub r6, r1, r3 ;destptr2
- sub r2, r6, lr ;srcptr2
- sub r1, r7, r3 ;srcptr1
- sub r5, r1, r8 ;destptr1
+ sub r1, r1, #16 ; src_ptr1 = u_buffer - Border
+    add     r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride))
+ sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
+ sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
copy_top_bottom_uv
vld1.8 {q0, q1}, [r1]!
@@ -207,7 +197,7 @@
vld1.8 {q2, q3}, [r1]!
vld1.8 {q10, q11}, [r2]!
- mov r7, r3
+ mov r7, #16 ; Border
top_bottom_16
subs r7, r7, #1
@@ -217,26 +207,26 @@
vst1.8 {q2, q3}, [r5]!
vst1.8 {q10, q11}, [r6]!
- add r5, r5, lr
+ add r5, r5, lr ; dest_ptr1 += plane_stride
sub r5, r5, #64
- add r6, r6, lr
+ add r6, r6, lr ; dest_ptr2 += plane_stride
sub r6, r6, #64
bne top_bottom_16
- sub r5, r1, r8
- add r6, r2, lr
+ sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
+ add r6, r2, lr ; dest_ptr2 = src_ptr2 + plane_stride
subs r12, r12, #1
bne copy_top_bottom_uv
- mov r7, lr, lsr #3 ;check to see if extra copy is needed
+ mov r7, lr, lsr #3 ; check to see if extra copy is needed
ands r7, r7, #0x7
bne extra_top_bottom_uv
end_of_border_copy_uv
subs r10, r10, #1
- ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
+ ldrne r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1
bne border_copy_uv
vpop {d8 - d15}
@@ -243,12 +233,11 @@
pop {r4 - r10, pc}
;;;;;;;;;;;;;;;;;;;;;;
-;extra copy part for Y
extra_top_bottom_y
vld1.8 {q0}, [r1]!
vld1.8 {q2}, [r2]!
- mov r9, r3, lsr #3
+ mov r9, #4 ; 32 >> 3
extra_top_bottom_32
subs r9, r9, #1
@@ -271,19 +260,18 @@
vst1.8 {q2}, [r6], lr
bne extra_top_bottom_32
- sub r5, r1, r8
- add r6, r2, lr
+ sub r5, r1, lr, asl #5 ; src_ptr1 - (Border * plane_stride)
+ add r6, r2, lr ; src_ptr2 + plane_stride
subs r7, r7, #1
bne extra_top_bottom_y
b end_of_border_copy_y
-;extra copy part for UV
extra_top_bottom_uv
vld1.8 {d0}, [r1]!
vld1.8 {d8}, [r2]!
- mov r9, r3, lsr #3
+ mov r9, #2 ; 16 >> 3
extra_top_bottom_16
subs r9, r9, #1
@@ -306,283 +294,12 @@
vst1.8 {d8}, [r6], lr
bne extra_top_bottom_16
- sub r5, r1, r8
- add r6, r2, lr
+ sub r5, r1, lr, asl #4 ; src_ptr1 - (Border * plane_stride)
+ add r6, r2, lr ; src_ptr2 + plane_stride
subs r7, r7, #1
bne extra_top_bottom_uv
b end_of_border_copy_uv
-
-
-;=======================
-b16_extend_frame_borders
-;border = 16
-;=======================
-;Border copy for Y plane
-;copy the left and right most columns out
- sub r5, r1, r3 ;destptr1
- add r6, r1, lr
- sub r6, r6, r3, lsl #1 ;destptr2
- sub r2, r6, #1 ;srcptr2
-
- ;Do four rows at one time
- mov r12, r4, lsr #2
-
-copy_left_right_y_b16
- vld1.8 {d0[], d1[]}, [r1], lr
- vld1.8 {d4[], d5[]}, [r2], lr
- vld1.8 {d8[], d9[]}, [r1], lr
- vld1.8 {d12[], d13[]}, [r2], lr
- vld1.8 {d16[], d17[]}, [r1], lr
- vld1.8 {d20[], d21[]}, [r2], lr
- vld1.8 {d24[], d25[]}, [r1], lr
- vld1.8 {d28[], d29[]}, [r2], lr
-
- subs r12, r12, #1
-
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q4}, [r5], lr
- vst1.8 {q6}, [r6], lr
- vst1.8 {q8}, [r5], lr
- vst1.8 {q10}, [r6], lr
- vst1.8 {q12}, [r5], lr
- vst1.8 {q14}, [r6], lr
-
- bne copy_left_right_y_b16
-
-;Now copy the top and bottom source lines into each line of the respective borders
- ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- mul r8, r3, lr
-
- mov r12, lr, lsr #7
-
- sub r6, r1, r3 ;destptr2
- sub r2, r6, lr ;srcptr2
- sub r1, r7, r3 ;srcptr1
- sub r5, r1, r8 ;destptr1
-
-copy_top_bottom_y_b16
- vld1.8 {q0, q1}, [r1]!
- vld1.8 {q8, q9}, [r2]!
- vld1.8 {q2, q3}, [r1]!
- vld1.8 {q10, q11}, [r2]!
- vld1.8 {q4, q5}, [r1]!
- vld1.8 {q12, q13}, [r2]!
- vld1.8 {q6, q7}, [r1]!
- vld1.8 {q14, q15}, [r2]!
-
- mov r7, r3
-
-top_bottom_16_b16
- subs r7, r7, #1
-
- vst1.8 {q0, q1}, [r5]!
- vst1.8 {q8, q9}, [r6]!
- vst1.8 {q2, q3}, [r5]!
- vst1.8 {q10, q11}, [r6]!
- vst1.8 {q4, q5}, [r5]!
- vst1.8 {q12, q13}, [r6]!
- vst1.8 {q6, q7}, [r5]!
- vst1.8 {q14, q15}, [r6]!
-
- add r5, r5, lr
- sub r5, r5, #128
- add r6, r6, lr
- sub r6, r6, #128
-
- bne top_bottom_16_b16
-
- sub r5, r1, r8
- add r6, r2, lr
-
- subs r12, r12, #1
- bne copy_top_bottom_y_b16
-
- mov r7, lr, lsr #4 ;check to see if extra copy is needed
- ands r7, r7, #0x7
- bne extra_top_bottom_y_b16
-end_of_border_copy_y_b16
-
-;Border copy for U, V planes
- ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
- mov lr, lr, lsr #1 ;uv_stride
- mov r3, r3, lsr #1 ;border
- mov r4, r4, lsr #1 ;uv_height
- mov r8, r8, lsr #2
-
- mov r10, #2
-
-;copy the left and right most columns out
-border_copy_uv_b16
- sub r5, r1, r3 ;destptr1
- add r6, r1, lr
- sub r6, r6, r3, lsl #1 ;destptr2
- sub r2, r6, #1 ;srcptr2
-
- mov r7, r1
-
- ;Do eight rows at one time
- mov r12, r4, lsr #3
-
-copy_left_right_uv_b16
- vld1.8 {d0[]}, [r1], lr
- vld1.8 {d2[]}, [r2], lr
- vld1.8 {d4[]}, [r1], lr
- vld1.8 {d6[]}, [r2], lr
- vld1.8 {d8[]}, [r1], lr
- vld1.8 {d10[]}, [r2], lr
- vld1.8 {d12[]}, [r1], lr
- vld1.8 {d14[]}, [r2], lr
- vld1.8 {d16[]}, [r1], lr
- vld1.8 {d18[]}, [r2], lr
- vld1.8 {d20[]}, [r1], lr
- vld1.8 {d22[]}, [r2], lr
- vld1.8 {d24[]}, [r1], lr
- vld1.8 {d26[]}, [r2], lr
- vld1.8 {d28[]}, [r1], lr
- vld1.8 {d30[]}, [r2], lr
-
- subs r12, r12, #1
-
- vst1.8 {d0}, [r5], lr
- vst1.8 {d2}, [r6], lr
- vst1.8 {d4}, [r5], lr
- vst1.8 {d6}, [r6], lr
- vst1.8 {d8}, [r5], lr
- vst1.8 {d10}, [r6], lr
- vst1.8 {d12}, [r5], lr
- vst1.8 {d14}, [r6], lr
- vst1.8 {d16}, [r5], lr
- vst1.8 {d18}, [r6], lr
- vst1.8 {d20}, [r5], lr
- vst1.8 {d22}, [r6], lr
- vst1.8 {d24}, [r5], lr
- vst1.8 {d26}, [r6], lr
- vst1.8 {d28}, [r5], lr
- vst1.8 {d30}, [r6], lr
-
- bne copy_left_right_uv_b16
-
-;Now copy the top and bottom source lines into each line of the respective borders
- mov r12, lr, lsr #6
-
- sub r6, r1, r3 ;destptr2
- sub r2, r6, lr ;srcptr2
- sub r1, r7, r3 ;srcptr1
- sub r5, r1, r8 ;destptr1
-
-copy_top_bottom_uv_b16
- vld1.8 {q0, q1}, [r1]!
- vld1.8 {q8, q9}, [r2]!
- vld1.8 {q2, q3}, [r1]!
- vld1.8 {q10, q11}, [r2]!
-
- mov r7, r3
-
-top_bottom_8_b16
- subs r7, r7, #1
-
- vst1.8 {q0, q1}, [r5]!
- vst1.8 {q8, q9}, [r6]!
- vst1.8 {q2, q3}, [r5]!
- vst1.8 {q10, q11}, [r6]!
-
- add r5, r5, lr
- sub r5, r5, #64
- add r6, r6, lr
- sub r6, r6, #64
-
- bne top_bottom_8_b16
-
- sub r5, r1, r8
- add r6, r2, lr
-
- subs r12, r12, #1
- bne copy_top_bottom_uv_b16
-
- mov r7, lr, lsr #3 ;check to see if extra copy is needed
- ands r7, r7, #0x7
- bne extra_top_bottom_uv_b16
-
-end_of_border_copy_uv_b16
- subs r10, r10, #1
- ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
- bne border_copy_uv_b16
-
- vpop {d8-d15}
- pop {r4 - r10, pc}
-
-;;;;;;;;;;;;;;;;;;;;;;
-;extra copy part for Y
-extra_top_bottom_y_b16
- vld1.8 {q0}, [r1]!
- vld1.8 {q2}, [r2]!
-
- mov r9, r3, lsr #3
-
-extra_top_bottom_16_b16
- subs r9, r9, #1
-
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- vst1.8 {q0}, [r5], lr
- vst1.8 {q2}, [r6], lr
- bne extra_top_bottom_16_b16
-
- sub r5, r1, r8
- add r6, r2, lr
- subs r7, r7, #1
- bne extra_top_bottom_y_b16
-
- b end_of_border_copy_y_b16
-
-;extra copy part for UV
-extra_top_bottom_uv_b16
- vld1.8 {d0}, [r1]!
- vld1.8 {d8}, [r2]!
-
- mov r9, r3, lsr #3
-
-extra_top_bottom_8_b16
- subs r9, r9, #1
-
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- vst1.8 {d0}, [r5], lr
- vst1.8 {d8}, [r6], lr
- bne extra_top_bottom_8_b16
-
- sub r5, r1, r8
- add r6, r2, lr
- subs r7, r7, #1
- bne extra_top_bottom_uv_b16
-
- b end_of_border_copy_uv_b16
ENDP
END
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -971,7 +971,7 @@
static const arg_def_t stereo_mode = ARG_DEF_ENUM(NULL, "stereo-mode", 1,
"Stereo 3D video format", stereo_mode_enum);
static const arg_def_t timebase = ARG_DEF(NULL, "timebase", 1,
- "Stream timebase (see below)");
+ "Output timestamp precision (fractional seconds)");
static const arg_def_t error_resilient = ARG_DEF(NULL, "error-resilient", 1,
"Enable error resiliency features");
static const arg_def_t lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1,
@@ -1127,7 +1127,7 @@
arg_show_usage(stdout, vp8_args);
#endif
fprintf(stderr, "\nStream timebase (--timebase):\n"
- " This is the unit of time used to represent frame timestamps,\n"
+ " The desired precision of timestamps in the output, expressed\n"
" in fractional seconds. Default is 1/1000.\n");
fprintf(stderr, "\n"
"Included encoders:\n"
--
⑨