shithub: libvpx

--- a/build/make/obj_int_extract.c

+++ b/build/make/obj_int_extract.c

@@ -9,26 +9,14 @@

*/

+#include <stdarg.h>

 #include <stdio.h>

 #include <stdlib.h>

+#include <string.h>

 #include "vpx_config.h"

-#if defined(_MSC_VER) || defined(__MINGW32__)

-#include <io.h>

-#include <share.h>

 #include "vpx/vpx_integer.h"

-#else

-#include <stdint.h>

-#include <unistd.h>

-#endif

-#include <string.h>

-#include <sys/types.h>

-#include <sys/stat.h>

-#include <fcntl.h>

-#include <stdarg.h>

 typedef enum

     OUTPUT_FMT_PLAIN,

@@ -47,7 +35,6 @@

 #if defined(__GNUC__) && __GNUC__

 #if defined(__MACH__)

 #include <mach-o/loader.h>

@@ -225,73 +212,6 @@

-int main(int argc, char **argv)

-{

-    int fd;

-    char *f;

-    struct stat stat_buf;

-    uint8_t *file_buf;

-    int res;

-    if (argc < 2 || argc > 3)

-    {

-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);

-        fprintf(stderr, "  <obj file>\tMachO format object file to parse\n");

-        fprintf(stderr, "Output Formats:\n");

-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");

-        fprintf(stderr, "  rvds - compatible with armasm\n");

-        goto bail;

-    }

-    f = argv[2];

-    if (!((!strcmp(argv[1], "rvds")) || (!strcmp(argv[1], "gas"))))

-        f = argv[1];

-    fd = open(f, O_RDONLY);

-    if (fd < 0)

-    {

-        perror("Unable to open file");

-        goto bail;

-    }

-    if (fstat(fd, &stat_buf))

-    {

-        perror("stat");

-        goto bail;

-    }

-    file_buf = malloc(stat_buf.st_size);

-    if (!file_buf)

-    {

-        perror("malloc");

-        goto bail;

-    }

-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)

-    {

-        perror("read");

-        goto bail;

-    }

-    if (close(fd))

-    {

-        perror("close");

-        goto bail;

-    }

-    res = parse_macho(file_buf, stat_buf.st_size);

-    free(file_buf);

-    if (!res)

-        return EXIT_SUCCESS;

-bail:

-    return EXIT_FAILURE;

-}

 #elif defined(__ELF__)

 #include "elf.h"

@@ -740,83 +660,11 @@

     return 1;

-int main(int argc, char **argv)

-{

-    int fd;

-    output_fmt_t mode;

-    char *f;

-    struct stat stat_buf;

-    uint8_t *file_buf;

-    int res;

-    if (argc < 2 || argc > 3)

-    {

-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);

-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");

-        fprintf(stderr, "Output Formats:\n");

-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");

-        fprintf(stderr, "  rvds - compatible with armasm\n");

-        goto bail;

-    }

-    f = argv[2];

-    if (!strcmp(argv[1], "rvds"))

-        mode = OUTPUT_FMT_RVDS;

-    else if (!strcmp(argv[1], "gas"))

-        mode = OUTPUT_FMT_GAS;

-    else

-        f = argv[1];

-    fd = open(f, O_RDONLY);

-    if (fd < 0)

-    {

-        perror("Unable to open file");

-        goto bail;

-    }

-    if (fstat(fd, &stat_buf))

-    {

-        perror("stat");

-        goto bail;

-    }

-    file_buf = malloc(stat_buf.st_size);

-    if (!file_buf)

-    {

-        perror("malloc");

-        goto bail;

-    }

-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)

-    {

-        perror("read");

-        goto bail;

-    }

-    if (close(fd))

-    {

-        perror("close");

-        goto bail;

-    }

-    res = parse_elf(file_buf, stat_buf.st_size, mode);

-    free(file_buf);

-    if (!res)

-        return EXIT_SUCCESS;

-bail:

-    return EXIT_FAILURE;

-}

 #endif

-#endif

+#endif /* defined(__GNUC__) && __GNUC__ */

-#if defined(_MSC_VER) || defined(__MINGW32__)

+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)

 /*  See "Microsoft Portable Executable and Common Object File Format Specification"

     for reference.

*/

@@ -823,13 +671,13 @@

 #define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )

 #define get_le16(x) ((*(x)) | (*(x+1)) << 8)

-int parse_coff(unsigned __int8 *buf, size_t sz)

+int parse_coff(uint8_t *buf, size_t sz)

     unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;

     unsigned int sectionrawdata_ptr;

     unsigned int i;

-    unsigned __int8 *ptr;

-    unsigned __int32 symoffset;

+    uint8_t *ptr;

+    uint32_t symoffset;

     char **sectionlist;  //this array holds all section names in their correct order.

     //it is used to check if the symbol is in .bss or .data section.

@@ -907,7 +755,7 @@

     for (i = 0; i < symtab_sz; i++)

-        __int16 section = get_le16(ptr + 12); //section number

+        int16_t section = get_le16(ptr + 12); //section number

         if (section > 0 && ptr[16] == 2)

@@ -978,20 +826,21 @@

     return 1;

+#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */

 int main(int argc, char **argv)

-    int fd;

-    output_fmt_t mode;

+    output_fmt_t mode = OUTPUT_FMT_PLAIN;

     const char *f;

-    struct _stat stat_buf;

-    unsigned __int8 *file_buf;

+    uint8_t *file_buf;

     int res;

+    FILE *fp;

+    long int file_size;

     if (argc < 2 || argc > 3)

         fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);

-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");

+        fprintf(stderr, "  <obj file>\tobject file to parse\n");

         fprintf(stderr, "Output Formats:\n");

         fprintf(stderr, "  gas  - compatible with GNU assembler\n");

         fprintf(stderr, "  rvds - compatible with armasm\n");

@@ -1007,15 +856,22 @@

     else

         f = argv[1];

-    fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);

+    fp = fopen(f, "rb");

-    if (_fstat(fd, &stat_buf))

+    if (!fp)

+        perror("Unable to open file");

+        goto bail;

+    }

+    if (fseek(fp, 0, SEEK_END))

+    {

         perror("stat");

         goto bail;

-    file_buf = malloc(stat_buf.st_size);

+    file_size = ftell(fp);

+    file_buf = malloc(file_size);

     if (!file_buf)

@@ -1023,19 +879,30 @@

         goto bail;

-    if (_read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)

+    rewind(fp);

+    if (fread(file_buf, sizeof(char), file_size, fp) != file_size)

         perror("read");

         goto bail;

-    if (_close(fd))

+    if (fclose(fp))

         perror("close");

         goto bail;

-    res = parse_coff(file_buf, stat_buf.st_size);

+#if defined(__GNUC__) && __GNUC__

+#if defined(__MACH__)

+    res = parse_macho(file_buf, file_size);

+#elif defined(__ELF__)

+    res = parse_elf(file_buf, file_size, mode);

+#endif

+#endif

+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)

+    res = parse_coff(file_buf, file_size);

+#endif

     free(file_buf);

@@ -1045,4 +912,3 @@

 bail:

     return EXIT_FAILURE;

-#endif

--- a/vp8/common/entropymv.h

+++ b/vp8/common/entropymv.h

@@ -18,6 +18,8 @@

     mv_max  = 1023,              /* max absolute value of a MV component */

     MVvals = (2 * mv_max) + 1,   /* # possible values "" */

+    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */

+    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */

     mvlong_width = 10,       /* Large MVs have 9 bit magnitudes */

     mvnum_short = 8,         /* magnitudes 0 through 7 */

--- a/vp8/decoder/detokenize.c

+++ b/vp8/decoder/detokenize.c

@@ -181,7 +181,7 @@

     ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;

     ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;

-    const VP8_COMMON *const oc = & dx->common;

+    const FRAME_CONTEXT * const fc = &dx->common.fc;

     BOOL_DECODER *bc = x->current_bc;

@@ -236,7 +236,7 @@

     range   = bc->range;

-    coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];

+    coef_probs = fc->coef_probs [type] [ 0 ] [0];

 BLOCK_LOOP:

     a = A + vp8_block2above[i];

@@ -348,7 +348,7 @@

         type = 0;

         i = 0;

         stop = 16;

-        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];

+        coef_probs = fc->coef_probs [type] [ 0 ] [0];

         qcoeff_ptr -= (24*16 + 16);

         goto BLOCK_LOOP;

@@ -356,7 +356,7 @@

     if (i == 16)

         type = 2;

-        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];

+        coef_probs = fc->coef_probs [type] [ 0 ] [0];

         stop = 24;

         goto BLOCK_LOOP;

--- a/vp8/encoder/block.h

+++ b/vp8/encoder/block.h

@@ -86,7 +86,7 @@

     int mvcosts[2][MVvals+1];

     int *mvcost[2];

-    int mvsadcosts[2][MVvals+1];

+    int mvsadcosts[2][MVfpvals+1];

     int *mvsadcost[2];

     int mbmode_cost[2][MB_MODE_COUNT];

     int intra_uv_mode_cost[2][MB_MODE_COUNT];

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -979,7 +979,7 @@

         if (flag[0] || flag[1])

-            vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);

+            vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);

 #endif

--- a/vp8/encoder/encodemv.c

+++ b/vp8/encoder/encodemv.c

@@ -134,10 +134,8 @@

     return cost;   // + vp8_cost_bit( p [MVPsign], v < 0);

-//#define M_LOG2_E 0.693147180559945309417

-//#define log2f(x) (log (x) / (float) M_LOG2_E)

-void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])

+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])

     int i = 1;   //-mv_max;

     unsigned int cost0 = 0;

@@ -144,22 +142,7 @@

     unsigned int cost1 = 0;

     vp8_clear_system_state();

-#if 0

-    mvsadcost [0] [0] = 300;

-    mvsadcost [1] [0] = 300;

-    do

-    {

-        double z = 256 * (2 * (log2f(2 * i) + .6));

-        mvsadcost [0][i] = (int) z;

-        mvsadcost [1][i] = (int) z;

-        mvsadcost [0][-i] = (int) z;

-        mvsadcost [1][-i] = (int) z;

-    }

-    while (++i <= mv_max);

-#endif

     i = 1;

     if (mvc_flag[0])

@@ -193,16 +176,6 @@

         while (++i <= mv_max);

-    /*

-        i=-mv_max;

-        do

-        {

-            mvcost [0] [i] = cost_mvcomponent( i, mvc[0]);

-            mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);

-        }

-        while( ++i <= mv_max);

-    */

@@ -436,7 +409,7 @@

);

     if (flags[0] || flags[1])

-        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);

+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);

 #ifdef ENTROPY_STATS

     active_section = 5;

--- a/vp8/encoder/encodemv.h

+++ b/vp8/encoder/encodemv.h

@@ -16,6 +16,6 @@

 void vp8_write_mvprobs(VP8_COMP *);

 void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);

-void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);

+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);

 #endif

--- a/vp8/encoder/ethreading.c

+++ b/vp8/encoder/ethreading.c

@@ -319,8 +319,8 @@

     vpx_memcpy(z->mvcosts,          x->mvcosts,         sizeof(x->mvcosts));

     z->mvcost[0] = &z->mvcosts[0][mv_max+1];

     z->mvcost[1] = &z->mvcosts[1][mv_max+1];

-    z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];

-    z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];

+    z->mvsadcost[0] = &z->mvsadcosts[0][mvfp_max+1];

+    z->mvsadcost[1] = &z->mvsadcosts[1][mvfp_max+1];

     vpx_memcpy(z->token_costs,       x->token_costs,      sizeof(x->token_costs));

--- a/vp8/encoder/firstpass.c

+++ b/vp8/encoder/firstpass.c

@@ -446,7 +446,7 @@

     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;

     // Initial step/diamond search centred on best mv

-    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);

+    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvcost, ref_mv);

     if ( tmp_err < INT_MAX-new_mv_mode_penalty )

         tmp_err += new_mv_mode_penalty;

@@ -469,7 +469,7 @@

             num00--;

         else

-            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);

+            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvcost, ref_mv);

             if ( tmp_err < INT_MAX-new_mv_mode_penalty )

                 tmp_err += new_mv_mode_penalty;

@@ -540,7 +540,7 @@

         int flag[2] = {1, 1};

         vp8_initialize_rd_consts(cpi, cm->base_qindex+cm->y1dc_delta_q);

         vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));

-        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);

+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);

     // for each macroblock row in image

--- a/vp8/encoder/mcomp.c

+++ b/vp8/encoder/mcomp.c

@@ -54,6 +54,11 @@

     //return (vp8_mv_bit_cost(mv,  ref, mvcost, 128) * error_per_bit + 128) >> 8;

+static int mvsad_err_cost(MV *mv, MV *ref, int *mvsadcost[2], int error_per_bit)

+{

+    /* Calculate sad error cost on full pixel basis. */

+    return ((mvsadcost[0][(mv->row - ref->row)] + mvsadcost[1][(mv->col - ref->col)]) * error_per_bit + 128) >> 8;

+}

 static int mv_bits(MV *mv, MV *ref, int *mvcost[2])

@@ -753,7 +758,7 @@

-#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)

+#define MVC(r,c) (((mvsadcost[0][r-rr] + mvsadcost[1][c-rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)

 #define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector

 #define DIST(r,c,v) vfp->sdf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.

 #define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost

@@ -801,8 +806,8 @@

     if (br > x->mv_row_max) br = x->mv_row_max;

-    rr >>= 1;

-    rc >>= 1;

+    rr >>= 3;

+    rc >>= 3;

     besterr = ERR(br, bc, thiserr);

@@ -915,7 +920,6 @@

     int error_per_bit,

     int *num00,

     vp8_variance_fn_ptr_t *fn_ptr,

-    int *mvsadcost[2],

     int *mvcost[2],

     MV *center_mv

@@ -944,8 +948,16 @@

     unsigned char *check_here;

     int thissad;

+    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};

+    MV fcenter_mv;

+    fcenter_mv.row = center_mv->row >> 3;

+    fcenter_mv.col = center_mv->col >> 3;

     *num00 = 0;

+    best_mv->row = ref_row;

+    best_mv->col = ref_col;

     // Work out the start point for the search

     in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);

     best_address = in_what;

@@ -955,7 +967,7 @@

     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))

         // Check the starting position

-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);

+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);

     // search_param determines the length of the initial step and hence the number of iterations

@@ -964,8 +976,6 @@

     tot_steps = (x->ss_count / x->searches_per_step) - search_param;

     i = 1;

-    best_mv->row = ref_row;

-    best_mv->col = ref_col;

     for (step = 0; step < tot_steps ; step++)

@@ -984,9 +994,9 @@

                 if (thissad < bestsad)

-                    this_mv.row = this_row_offset << 3;

-                    this_mv.col = this_col_offset << 3;

-                    thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                    this_mv.row = this_row_offset;

+                    this_mv.col = this_col_offset;

+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                     if (thissad < bestsad)

@@ -1031,7 +1041,6 @@

     int error_per_bit,

     int *num00,

     vp8_variance_fn_ptr_t *fn_ptr,

-    int *mvsadcost[2],

     int *mvcost[2],

     MV *center_mv

@@ -1060,7 +1069,14 @@

     unsigned char *check_here;

     unsigned int thissad;

+    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};

+    MV fcenter_mv;

+    fcenter_mv.row = center_mv->row >> 3;

+    fcenter_mv.col = center_mv->col >> 3;

     *num00 = 0;

+    best_mv->row = ref_row;

+    best_mv->col = ref_col;

     // Work out the start point for the search

     in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);

@@ -1071,7 +1087,7 @@

     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))

         // Check the starting position

-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);

+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);

     // search_param determines the length of the initial step and hence the number of iterations

@@ -1080,8 +1096,6 @@

     tot_steps = (x->ss_count / x->searches_per_step) - search_param;

     i = 1;

-    best_mv->row = ref_row;

-    best_mv->col = ref_col;

     for (step = 0; step < tot_steps ; step++)

@@ -1111,9 +1125,9 @@

                     if (sad_array[t] < bestsad)

-                        this_mv.row = (best_mv->row + ss[i].mv.row) << 3;

-                        this_mv.col = (best_mv->col + ss[i].mv.col) << 3;

-                        sad_array[t] += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                        this_mv.row = best_mv->row + ss[i].mv.row;

+                        this_mv.col = best_mv->col + ss[i].mv.col;

+                        sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                         if (sad_array[t] < bestsad)

@@ -1140,9 +1154,9 @@

                     if (thissad < bestsad)

-                        this_mv.row = this_row_offset << 3;

-                        this_mv.col = this_col_offset << 3;

-                        thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                        this_mv.row = this_row_offset;

+                        this_mv.col = this_col_offset;

+                        thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                         if (thissad < bestsad)

@@ -1178,7 +1192,7 @@

 #if !(CONFIG_REALTIME_ONLY)

-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)

+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)

     unsigned char *what = (*(b->base_src) + b->src);

     int what_stride = b->src_stride;

@@ -1202,6 +1216,11 @@

     int col_min = ref_col - distance;

     int col_max = ref_col + distance;

+    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};

+    MV fcenter_mv;

+    fcenter_mv.row = center_mv->row >> 3;

+    fcenter_mv.col = center_mv->col >> 3;

     // Work out the mid point for the search

     in_what = *(d->base_pre) + d->pre;

     bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;

@@ -1216,7 +1235,7 @@

         // Baseline value at the centre

         //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));

-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);

+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);

     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border

@@ -1234,7 +1253,7 @@

     for (r = row_min; r < row_max ; r++)

-        this_mv.row = r << 3;

+        this_mv.row = r;

         check_here = r * mv_stride + in_what + col_min;

         for (c = col_min; c < col_max; c++)

@@ -1241,10 +1260,10 @@

             thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);

-            this_mv.col = c << 3;

+            this_mv.col = c;

             //thissad += (int)sqrt(mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));

             //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];

-            thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);

+            thissad  += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);

             if (thissad < bestsad)

@@ -1268,7 +1287,7 @@

         return INT_MAX;

-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)

+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)

     unsigned char *what = (*(b->base_src) + b->src);

     int what_stride = b->src_stride;

@@ -1294,6 +1313,11 @@

     unsigned int sad_array[3];

+    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};

+    MV fcenter_mv;

+    fcenter_mv.row = center_mv->row >> 3;

+    fcenter_mv.col = center_mv->col >> 3;

     // Work out the mid point for the search

     in_what = *(d->base_pre) + d->pre;

     bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;

@@ -1306,7 +1330,7 @@

     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))

         // Baseline value at the centre

-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);

+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);

     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border

@@ -1324,7 +1348,7 @@

     for (r = row_min; r < row_max ; r++)

-        this_mv.row = r << 3;

+        this_mv.row = r;

         check_here = r * mv_stride + in_what + col_min;

         c = col_min;

@@ -1340,8 +1364,8 @@

                 if (thissad < bestsad)

-                    this_mv.col = c << 3;

-                    thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                    this_mv.col = c;

+                    thissad  += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                     if (thissad < bestsad)

@@ -1363,8 +1387,8 @@

             if (thissad < bestsad)

-                this_mv.col = c << 3;

-                thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                this_mv.col = c;

+                thissad  += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                 if (thissad < bestsad)

@@ -1391,7 +1415,7 @@

         return INT_MAX;

-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)

+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)

     unsigned char *what = (*(b->base_src) + b->src);

     int what_stride = b->src_stride;

@@ -1418,6 +1442,11 @@

     DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);

     unsigned int sad_array[3];

+    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};

+    MV fcenter_mv;

+    fcenter_mv.row = center_mv->row >> 3;

+    fcenter_mv.col = center_mv->col >> 3;

     // Work out the mid point for the search

     in_what = *(d->base_pre) + d->pre;

     bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;

@@ -1430,7 +1459,7 @@

     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))

         // Baseline value at the centre

-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);

+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);

     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border

@@ -1448,7 +1477,7 @@

     for (r = row_min; r < row_max ; r++)

-        this_mv.row = r << 3;

+        this_mv.row = r;

         check_here = r * mv_stride + in_what + col_min;

         c = col_min;

@@ -1464,8 +1493,8 @@

                 if (thissad < bestsad)

-                    this_mv.col = c << 3;

-                    thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                    this_mv.col = c;

+                    thissad  += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                     if (thissad < bestsad)

@@ -1493,8 +1522,8 @@

                 if (thissad < bestsad)

-                    this_mv.col = c << 3;

-                    thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                    this_mv.col = c;

+                    thissad  += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                     if (thissad < bestsad)

@@ -1516,8 +1545,8 @@

             if (thissad < bestsad)

-                this_mv.col = c << 3;

-                thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

+                this_mv.col = c;

+                thissad  += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);

                 if (thissad < bestsad)

--- a/vp8/encoder/mcomp.h

+++ b/vp8/encoder/mcomp.h

@@ -66,7 +66,6 @@

      int distance, \

      vp8_variance_fn_ptr_t *fn_ptr, \

      int *mvcost[2], \

-     int *mvsadcost[2], \

      MV *center_mv \

@@ -82,7 +81,6 @@

      int error_per_bit, \

      int *num00, \

      vp8_variance_fn_ptr_t *fn_ptr, \

-     int *mvsadcost[2], \

      int *mvcost[2], \

      MV *center_mv \

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -1866,13 +1866,13 @@

do

-        double z = 256 * (2 * (log2f(2 * i) + .6));

+        double z = 256 * (2 * (log2f(8 * i) + .6));

         mvsadcost [0][i] = (int) z;

         mvsadcost [1][i] = (int) z;

         mvsadcost [0][-i] = (int) z;

         mvsadcost [1][-i] = (int) z;

-    while (++i <= mv_max);

+    while (++i <= mvfp_max);

 VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)

@@ -2069,8 +2069,8 @@

     cpi->mb.mvcost[0] = &cpi->mb.mvcosts[0][mv_max+1];

     cpi->mb.mvcost[1] = &cpi->mb.mvcosts[1][mv_max+1];

-    cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mv_max+1];

-    cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mv_max+1];

+    cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mvfp_max+1];

+    cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mvfp_max+1];

     cal_mvsadcosts(cpi->mb.mvsadcost);

--- a/vp8/encoder/pickinter.c

+++ b/vp8/encoder/pickinter.c

@@ -738,7 +738,7 @@

             else

-                bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9

+                bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb < 9

                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;

                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -757,7 +757,7 @@

                         num00--;

                     else

-                        thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9

+                        thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb = 9

                         if (thissme < bestsme)

--- a/vp8/encoder/ratectrl.c

+++ b/vp8/encoder/ratectrl.c

@@ -355,7 +355,7 @@

     vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));

         int flag[2] = {1, 1};

-        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);

+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);

     vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc));  //initialize pre_mvc to all zero.

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -1224,7 +1224,7 @@

                         bestsme = cpi->diamond_search_sad(x, c, e, bsi->mvp,

                                                           &mode_mv[NEW4X4], step_param,

-                                                          sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);

+                                                          sadpb / 2, &num00, v_fn_ptr, x->mvcost, bsi->ref_mv);

                         n = num00;

                         num00 = 0;

@@ -1239,7 +1239,7 @@

                                 thissme = cpi->diamond_search_sad(x, c, e, bsi->mvp,

                                                                   &temp_mv, step_param + n,

-                                                                  sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);

+                                                                  sadpb / 2, &num00, v_fn_ptr, x->mvcost, bsi->ref_mv);

                                 if (thissme < bestsme)

@@ -1257,7 +1257,7 @@

                     if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)

                         thissme = cpi->full_search_sad(x, c, e, bsi->mvp,

-                                                       sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost,bsi->ref_mv);

+                                                       sadpb / 4, 16, v_fn_ptr, x->mvcost, bsi->ref_mv);

                         if (thissme < bestsme)

@@ -2167,7 +2167,7 @@

                     else

-                        bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9

+                        bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb < 9

                         mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;

                         mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -2186,7 +2186,7 @@

                                 num00--;

                             else

-                                thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9

+                                thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb = 9

                                 if (thissme < bestsme)

@@ -2232,7 +2232,7 @@

                         int sadpb = x->sadperbit16 >> 2;

-                        thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost,&best_ref_mv);

+                        thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv);

                     // Barrier threshold to initiating full search

--- a/vp8/encoder/temporal_filter.c

+++ b/vp8/encoder/temporal_filter.c

@@ -195,63 +195,14 @@

         further_steps = 0;

-    if (1/*cpi->sf.search_method == HEX*/)

-    {

-        // TODO Check that the 16x16 vf & sdf are selected here

-        bestsme = vp8_hex_search(x, b, d,

-            &best_ref_mv1, &d->bmi.mv.as_mv,

-            step_param,

-            sadpb/*x->errorperbit*/,

-            &num00, &cpi->fn_ptr[BLOCK_16X16],

-            mvsadcost, mvcost, &best_ref_mv1);

-    }

-    else

-    {

-        int mv_x, mv_y;

-        bestsme = cpi->diamond_search_sad(x, b, d,

-            &best_ref_mv1, &d->bmi.mv.as_mv,

-            step_param,

-            sadpb / 2/*x->errorperbit*/,

-            &num00, &cpi->fn_ptr[BLOCK_16X16],

-            mvsadcost, mvcost, &best_ref_mv1); //sadpb < 9

-        // Further step/diamond searches as necessary

-        n = 0;

-        //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;

-        n = num00;

-        num00 = 0;

-        while (n < further_steps)

-        {

-            n++;

-            if (num00)

-                num00--;

-            else

-            {

-                thissme = cpi->diamond_search_sad(x, b, d,

-                    &best_ref_mv1, &d->bmi.mv.as_mv,

-                    step_param + n,

-                    sadpb / 4/*x->errorperbit*/,

-                    &num00, &cpi->fn_ptr[BLOCK_16X16],

-                    mvsadcost, mvcost, &best_ref_mv1); //sadpb = 9

-                if (thissme < bestsme)

-                {

-                    bestsme = thissme;

-                    mv_y = d->bmi.mv.as_mv.row;

-                    mv_x = d->bmi.mv.as_mv.col;

-                }

-                else

-                {

-                    d->bmi.mv.as_mv.row = mv_y;

-                    d->bmi.mv.as_mv.col = mv_x;

-                }

-            }

-        }

-    }

+    /*cpi->sf.search_method == HEX*/

+    // TODO Check that the 16x16 vf & sdf are selected here

+    bestsme = vp8_hex_search(x, b, d,

+        &best_ref_mv1, &d->bmi.mv.as_mv,

+        step_param,

+        sadpb/*x->errorperbit*/,

+        &num00, &cpi->fn_ptr[BLOCK_16X16],

+        mvsadcost, mvcost, &best_ref_mv1);

 #if ALT_REF_SUBPEL_ENABLED

     // Try sub-pixel MC?

--- a/vp8/encoder/x86/quantize_sse2.asm

+++ b/vp8/encoder/x86/quantize_sse2.asm

@@ -22,35 +22,36 @@

     mov         rbp, rsp

     SAVE_XMM

     GET_GOT     rbx

-    push        rsi

 %if ABI_IS_32BIT

     push        rdi

+    push        rsi

 %else

   %ifidn __OUTPUT_FORMAT__,x64

     push        rdi

+    push        rsi

   %endif

 %endif

     ALIGN_STACK 16, rax

-    %define BLOCKD_d          0  ;  8

-    %define zrun_zbin_boost   8  ;  8

-    %define abs_minus_zbin    16 ; 32

-    %define temp_qcoeff       48 ; 32

-    %define qcoeff            80 ; 32

-    %define stack_size        112

+    %define zrun_zbin_boost   0  ;  8

+    %define abs_minus_zbin    8  ; 32

+    %define temp_qcoeff       40 ; 32

+    %define qcoeff            72 ; 32

+    %define stack_size        104

     sub         rsp, stack_size

     ; end prolog

 %if ABI_IS_32BIT

-    mov         rdi, arg(0)

+    mov         rdi, arg(0)                 ; BLOCK *b

+    mov         rsi, arg(1)                 ; BLOCKD *d

 %else

   %ifidn __OUTPUT_FORMAT__,x64

     mov         rdi, rcx                    ; BLOCK *b

-    mov         [rsp + BLOCKD_d], rdx

+    mov         rsi, rdx                    ; BLOCKD *d

   %else

     ;mov         rdi, rdi                    ; BLOCK *b

-    mov         [rsp + BLOCKD_d], rsi

+    ;mov         rsi, rsi                    ; BLOCKD *d

   %endif

 %endif

@@ -125,45 +126,44 @@

     movdqa      [rsp + qcoeff], xmm6

     movdqa      [rsp + qcoeff + 16], xmm6

-    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr

+    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr

     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr

-    mov         [rsp + zrun_zbin_boost], rsi

+    mov         [rsp + zrun_zbin_boost], rdx

 %macro ZIGZAG_LOOP 1

-    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc

     ; x

-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

+    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]

     ; if (x >= zbin)

-    sub         cx, WORD PTR[rsi]           ; x - zbin

-    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++

+    sub         cx, WORD PTR[rdx]           ; x - zbin

+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++

     jl          rq_zigzag_loop_%1           ; x < zbin

-    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]

+    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]

-    ; downshift by quant_shift[rdx]

-    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]

+    ; downshift by quant_shift[rc]

+    movsx       ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc]

     sar         edi, cl                     ; also sets Z bit

     je          rq_zigzag_loop_%1           ; !y

-    mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]

-    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost

+    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]

+    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost

 rq_zigzag_loop_%1:

 %endmacro

-ZIGZAG_LOOP 0

-ZIGZAG_LOOP 1

-ZIGZAG_LOOP 2

-ZIGZAG_LOOP 3

-ZIGZAG_LOOP 4

-ZIGZAG_LOOP 5

-ZIGZAG_LOOP 6

-ZIGZAG_LOOP 7

-ZIGZAG_LOOP 8

-ZIGZAG_LOOP 9

-ZIGZAG_LOOP 10

-ZIGZAG_LOOP 11

+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c

+ZIGZAG_LOOP  0

+ZIGZAG_LOOP  1

+ZIGZAG_LOOP  4

+ZIGZAG_LOOP  8

+ZIGZAG_LOOP  5

+ZIGZAG_LOOP  2

+ZIGZAG_LOOP  3

+ZIGZAG_LOOP  6

+ZIGZAG_LOOP  9

 ZIGZAG_LOOP 12

 ZIGZAG_LOOP 13

+ZIGZAG_LOOP 10

+ZIGZAG_LOOP  7

+ZIGZAG_LOOP 11

 ZIGZAG_LOOP 14

 ZIGZAG_LOOP 15

@@ -170,15 +170,9 @@

     movdqa      xmm2, [rsp + qcoeff]

     movdqa      xmm3, [rsp + qcoeff + 16]

-%if ABI_IS_32BIT

-    mov         rdi, arg(1)

-%else

-    mov         rdi, [rsp + BLOCKD_d]

-%endif

+    mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr

+    mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr

-    mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr

-    mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr

     ; y ^ sz

     pxor        xmm2, xmm0

     pxor        xmm3, xmm4

@@ -190,7 +184,7 @@

     movdqa      xmm0, [rcx]

     movdqa      xmm1, [rcx + 16]

-    mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr

+    mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr

     pmullw      xmm0, xmm2

     pmullw      xmm1, xmm3

@@ -197,8 +191,8 @@

     movdqa      [rcx], xmm2        ; store qcoeff

     movdqa      [rcx + 16], xmm3

-    movdqa      [rsi], xmm0        ; store dqcoeff

-    movdqa      [rsi + 16], xmm1

+    movdqa      [rdi], xmm0        ; store dqcoeff

+    movdqa      [rdi + 16], xmm1

     ; select the last value (in zig_zag order) for EOB

     pcmpeqw     xmm2, xmm6

@@ -220,19 +214,20 @@

     pmaxsw      xmm2, xmm3

     movd        eax, xmm2

     and         eax, 0xff

-    mov         [rdi + vp8_blockd_eob], eax

+    mov         [rsi + vp8_blockd_eob], eax

     ; begin epilog

     add         rsp, stack_size

     pop         rsp

 %if ABI_IS_32BIT

+    pop         rsi

     pop         rdi

 %else

   %ifidn __OUTPUT_FORMAT__,x64

+    pop         rsi

     pop         rdi

   %endif

 %endif

-    pop         rsi

     RESTORE_GOT

     RESTORE_XMM

     pop         rbp

@@ -347,11 +342,6 @@

 SECTION_RODATA

 align 16

-zig_zag:

-  dw 0x0000, 0x0001, 0x0004, 0x0008

-  dw 0x0005, 0x0002, 0x0003, 0x0006

-  dw 0x0009, 0x000c, 0x000d, 0x000a

-  dw 0x0007, 0x000b, 0x000e, 0x000f

 inv_zig_zag:

   dw 0x0001, 0x0002, 0x0006, 0x0007

   dw 0x0003, 0x0005, 0x0008, 0x000d

--

⑨