ref: 71e5eaf30c76e8bbd82fe4c9592d4406d7fa1c08
author: Sigrid Solveig Haflínudóttir <ftrvxmtrx@gmail.com>
date: Tue Jul 13 08:08:57 EDT 2021
THAT'S A FIRST
--- /dev/null
+++ b/LICENSE
@@ -1,0 +1,1 @@
+Public domain.
--- /dev/null
+++ b/README.md
@@ -1,0 +1,3 @@
+# hj264
+
+H.264 encoder for Plan 9. WIP.
--- /dev/null
+++ b/hj264.c
@@ -1,0 +1,376 @@
+#define MINIH264_IMPLEMENTATION
+#define H264E_MAX_THREADS 7
+#include "minih264e.h"
+#include <thread.h>
+#include <bio.h>
+#include <draw.h>
+#include <memdraw.h>
+#include <tos.h>
+
+#define max(a,b) ((a)>(b)?(a):(b))
+#define min(a,b) ((a)<(b)?(a):(b))
+#define clp(v,a,b) min((b), max((v),(a)))
+#define align(p,a) (void*)((((uintptr)p - 1) | (a-1)) + 1)
+
+enum {
+ Align = 64,
+ Maxquality = 10,
+ Gop = 20,
+};
+
+typedef struct Hjob Hjob;
+typedef struct Hjthread Hjthread;
+typedef struct Hj264 Hj264;
+
+struct Hjob {
+ void (*run)(void *);
+ void *arg;
+};
+
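+/* per-worker channels: jobs arrive on "job"; completion (and initial readiness) is signalled on "done" */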
+struct Hjthread {
+ int id;
+ Channel *job;
+ Channel *done;
+};
+
+struct Hj264 {
+ H264E_persist_t *persist;
+ H264E_scratch_t *scratch;
+ H264E_run_param_t rp;
+ H264E_io_yuv_t yuv;
+ Hjthread threads[H264E_MAX_THREADS];
+ Hjob jobs[H264E_MAX_THREADS];
+ int nthreads;
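+	/* variable-size tail: 64-byte-aligned YUV planes, persistent state and scratch memory are carved out of this */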
+ u8int buf[1];
+};
+
+static void
+xrgb2yuv(u8int *src, int stride, int h, H264E_io_yuv_t *io)
+{
+ int x, y, r, g, b;
+ u8int *bgrx, *yuv[3];
+
+ yuv[0] = io->yuv[0];
+ yuv[1] = io->yuv[1];
+ yuv[2] = io->yuv[2];
+
+ for(y = 0; y < h;){
+ bgrx = &src[y * stride];
+ for(x = 0; x < stride/4;){
+ b = bgrx[0];
+ g = bgrx[1];
+ r = bgrx[2];
+ bgrx += 4;
+/* this is not the "full" swing, just sayin' */
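+/* fixed-point BT.601 coefficients, limited ("studio") range: Y in [16..235], U and V in [16..240] */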
+#define YY ((( 66*r + 129*g + 25*b + 128) >> 8) + 16)
+#define UU (((-38*r - 74*g + 112*b + 128) >> 8) + 128)
+#define VV (((112*r - 94*g - 18*b + 128) >> 8) + 128)
+ yuv[0][x] = YY;
+ yuv[1][x/2] = UU;
+ yuv[2][x/2] = VV;
+ x++;
+
+ b = bgrx[0];
+ g = bgrx[1];
+ r = bgrx[2];
+ bgrx += 4;
+ yuv[0][x] = YY;
+ x++;
+ }
+ yuv[0] += io->stride[0];
+ y++;
+
+ for(x = 0; x < stride/4;){
+ b = bgrx[0];
+ g = bgrx[1];
+ r = bgrx[2];
+ bgrx += 4;
+ yuv[0][x] = YY;
+ x++;
+#undef YY
+#undef UU
+#undef VV
+ }
+ yuv[0] += io->stride[0];
+ yuv[1] += io->stride[1];
+ yuv[2] += io->stride[1];
+ y++;
+ }
+}
+
+static void
+threadf(void *p)
+{
+ Hjthread *t;
+ Hjob *j;
+ Channel *job, *done;
+
+ t = p;
+ threadsetname("hj264/%d", t->id);
+
+ job = t->job;
+ done = t->done;
+ for(sendp(done, nil); (j = recvp(job)) != nil; sendp(done, j))
+ j->run(j->arg);
+
+ chanfree(done);
+ chanfree(job);
+
+ threadexits(nil);
+}
+
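+/* H264E run_func_in_thread callback: dispatch njob jobs to the worker procs, at most nthreads at a time, waiting for each batch */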
+static void
+hjobsrun(void *p, void (*run)(void *), void **arg, int njob)
+{
+ int n, t;
+ Hj264 *h;
+ Hjob *j;
+
+ h = p;
+ for(n = 0; n < njob;){
+ for(t = 0; t < h->nthreads && n < njob; t++, n++){
+ j = &h->jobs[t];
+ j->run = run;
+ j->arg = arg[n];
+ sendp(h->threads[t].job, j);
+ }
+
+ for(t--; t >= 0; t--)
+ recvp(h->threads[t].done);
+ }
+}
+
+static int
+hj264_encode(Hj264 *h, u8int **data, int *sz)
+{
+ int e;
+
+ if((e = H264E_encode(h->persist, h->scratch, &h->rp, &h->yuv, data, sz)) != 0){
+ werrstr("H264E_encode: error %d", e);
+ return -1;
+ }
+
+ return 0;
+}
+
+static Hj264 *
+hj264new(int nthreads, int denoise, int kbps, int ww, int hh)
+{
+ int i, e, szscratch, szpersist, szyuv;
+ H264E_create_param_t cp;
+ Hjthread *t;
+ u8int *p;
+ Hj264 *h;
+
+ nthreads = clp(nthreads, 1, H264E_MAX_THREADS);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.num_layers = 1;
+ cp.gop = Gop;
+ cp.max_threads = nthreads;
+ cp.const_input_flag = 1;
+ cp.temporal_denoise_flag = denoise;
+	cp.vbv_size_bytes = kbps*1000/8*2; /* 2 seconds */
+ cp.width = ww;
+ cp.height = hh;
+
+ if((e = H264E_sizeof(&cp, &szpersist, &szscratch)) != 0){
+ werrstr("H264E_sizeof: error %d", e);
+ return nil;
+ }
+
+	/* round buffer dimensions up to a multiple of 16 for the YUV plane layout */
+ ww = ((ww-1) | 15) + 1;
+ hh = ((hh-1) | 15) + 1;
+ szyuv = ww*hh*3/2;
+ if((h = calloc(1, sizeof(*h) + Align+szyuv + Align+szpersist + Align+szscratch)) == nil)
+ return nil;
+
+ p = align(h->buf, Align);
+ h->yuv.yuv[0] = p;
+ h->yuv.stride[0] = ww;
+ h->yuv.yuv[1] = p + ww*hh;
+ h->yuv.stride[1] = ww/2;
+ h->yuv.yuv[2] = p + ww*hh*5/4;
+ h->yuv.stride[2] = ww/2;
+ h->persist = align(p+szyuv, Align);
+ h->scratch = align(h->persist+szpersist, Align);
+
+ cp.token = h;
+ cp.run_func_in_thread = hjobsrun;
+ H264E_init(h->persist, &cp);
+
+ h->nthreads = nthreads;
+ for(i = 0; i < nthreads; i++){
+ t = &h->threads[i];
+ t->id = i;
+ t->job = chancreate(sizeof(void*), 0);
+ t->done = chancreate(sizeof(void*), 0);
+ proccreate(threadf, t, mainstacksize);
+ recvp(t->done);
+ }
+
+ return h;
+}
+
+static void
+hj264free(Hj264 *h)
+{
+ int i;
+
+ for(i = 0; i < h->nthreads; i++){
+ chanclose(h->threads[i].done);
+ chanclose(h->threads[i].job);
+ }
+
+ free(h);
+}
+
+static uvlong
+nanosec(void)
+{
+ static uvlong fasthz, xstart;
+ uvlong x, div;
+
+ if(fasthz == ~0ULL)
+ return nsec() - xstart;
+
+ if(fasthz == 0){
+ if(_tos->cyclefreq){
+ cycles(&xstart);
+ fasthz = _tos->cyclefreq;
+ } else {
+ xstart = nsec();
+ fasthz = ~0ULL;
+ fprint(2, "cyclefreq not available, falling back to nsec()\n");
+ fprint(2, "you might want to disable aux/timesync\n");
+ return 0;
+ }
+ }
+ cycles(&x);
+ x -= xstart;
+
+	/* ugly, but avoids overflow: scale x up and div down in lockstep so x*1e9/fasthz fits in 64 bits */
+ for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL);
+
+ return x / (fasthz / div);
+}
+
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-d] [-f FPS] [-n THREADS] [-k KBPS] [-q 0…10] [-Q QP] file\n", argv0);
+ threadexitsall("usage");
+}
+
+int
+main(int argc, char **argv)
+{
+ int nthreads, fps, kbps, denoise, quality, qp;
+ int ww, hh, in, sz, srcsz, nframes;
+ uvlong start, end;
+ u8int *data, *src;
+ Memimage *im;
+ Biobuf out;
+ Hj264 *h;
+ char *s;
+
+	/* default to NPROC-1 encoder threads, or 1 if NPROC is unset */
+ nthreads = ((s = getenv("NPROC")) != nil) ? atoi(s)-1 : 1;
+ denoise = 0;
+ quality = 10;
+ kbps = 0;
+ fps = 30;
+ qp = 33;
+ ARGBEGIN{
+ case 'd':
+ denoise++;
+ break;
+ case 'f':
+ fps = atoi(EARGF(usage()));
+ break;
+ case 'k':
+ kbps = atoi(EARGF(usage()));
+ break;
+ case 'n':
+ nthreads = atoi(EARGF(usage()));
+ break;
+ case 'q':
+ quality = atoi(EARGF(usage()));
+ break;
+ case 'Q':
+ qp = atoi(EARGF(usage()));
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ if(argc < 1)
+ usage();
+ if((in = open(*argv, OREAD)) < 0)
+ sysfatal("input: %r");
+ if(Binit(&out, 1, OWRITE) < 0)
+ sysfatal("Binit failed: %r");
+
+ memimageinit();
+ nanosec();
+
+ if(quality > Maxquality)
+ quality = Maxquality;
+ if(kbps < 0)
+ kbps = 0;
+
+ src = nil;
+ srcsz = 0;
+ h = nil;
+ start = nanosec();
+ for(nframes = 0;; nframes++){
+ seek(in, 0, 0);
+ if((im = readmemimage(in)) == nil)
+ break;
+ ww = Dx(im->r);
+ hh = Dy(im->r);
+
+ if(h == nil){
+ srcsz = Dy(im->r)*(2+bytesperline(im->r, im->depth));
+ if((src = malloc(srcsz)) == nil)
+ sysfatal("memory");
+ unloadmemimage(im, im->r, src, srcsz);
+
+ if((h = hj264new(nthreads, denoise, kbps, ww, hh)) == nil)
+ sysfatal("hj264new: %r");
+ h->rp.encode_speed = Maxquality - quality;
+ h->rp.qp_min = h->rp.qp_max = qp;
+ if(kbps > 0){
+ h->rp.qp_min = 10;
+ h->rp.qp_max = 50;
+ h->rp.desired_frame_bytes = kbps*1000/8/fps;
+ }
+ }
+
+ unloadmemimage(im, im->r, src, srcsz);
+ xrgb2yuv(src, bytesperline(im->r, im->depth), Dy(im->r), &h->yuv);
+ freememimage(im);
+
+ if(hj264_encode(h, &data, &sz) != 0)
+ sysfatal("hj264_encode: %r");
+ if(Bwrite(&out, data, sz) != sz)
+ break;
+ if(nanosec() - start > 4000000000ULL)
+ break;
+ }
+ end = nanosec();
+ fprint(2, "%d fps\n", (int)(nframes / ((end - start)/1000000000ULL)));
+
+ /* FIXME flush on note */
+ Bflush(&out);
+ hj264free(h);
+
+ threadexitsall(nil);
+
+ return 0;
+}
--- /dev/null
+++ b/minih264e.h
@@ -1,0 +1,11730 @@
+#ifndef MINIH264_H
+#define MINIH264_H
+/*
+ https://github.com/lieff/minih264
+ To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide.
+ This software is distributed without any warranty.
+ See <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef H264E_SVC_API
+# define H264E_SVC_API 1
+#endif
+
+#ifndef H264E_MAX_THREADS
+# define H264E_MAX_THREADS 4
+#endif
+
+/**
+* API return error codes
+*/
+#define H264E_STATUS_SUCCESS 0
+#define H264E_STATUS_BAD_ARGUMENT 1
+#define H264E_STATUS_BAD_PARAMETER 2
+#define H264E_STATUS_BAD_FRAME_TYPE 3
+#define H264E_STATUS_SIZE_NOT_MULTIPLE_16 4
+#define H264E_STATUS_SIZE_NOT_MULTIPLE_2 5
+#define H264E_STATUS_BAD_LUMA_ALIGN 6
+#define H264E_STATUS_BAD_LUMA_STRIDE 7
+#define H264E_STATUS_BAD_CHROMA_ALIGN 8
+#define H264E_STATUS_BAD_CHROMA_STRIDE 9
+
+/**
+* Frame type definitions
+* - Sequence must start with key (IDR) frame.
+* - P (Predicted) frames are most efficiently coded
+* - Droppable frames may be safely removed from the bitstream, and used
+* for frame rate scalability
+* - Golden and Recovery frames are used for error recovery. These
+* frames use the "long-term reference" for prediction, and
+* can be decoded even if the P-frame sequence is interrupted.
+* They act similarly to a key frame, but are coded more efficiently.
+*
+* Type Refers to Saved as long-term Saved as short-term
+* ---------------------------------------------------------------
+* Key (IDR) : N/A Yes Yes |
+* Golden : long-term Yes Yes |
+* Recovery : long-term No Yes |
+* P : short-term No Yes |
+* Droppable : short-term No No |
+* |
+* Example sequence: K P P G D P R D K |
+* long-term reference 1K 1K 1K 4G 4G 4G 4G 4G 9K |
+* / \ / \ / |
+* coded frame 1K 2P 3P 4G 5D 6P 7R 8D 9K |
+* \ / \ / \ \ / / \ \ / \ |
+* short-term reference 1K 2P 3P 4G 4G 6P 7R 7R 9K |
+*
+*/
+#define H264E_FRAME_TYPE_DEFAULT 0 // Frame type set according to GOP size
+#define H264E_FRAME_TYPE_KEY 6 // Random access point: SPS+PPS+Intra frame
+#define H264E_FRAME_TYPE_I 5 // Intra frame: updates long & short-term reference
+#define H264E_FRAME_TYPE_GOLDEN 4 // Use and update long-term reference
+#define H264E_FRAME_TYPE_RECOVERY 3 // Use long-term reference, updates short-term reference
+#define H264E_FRAME_TYPE_P 2 // Use and update short-term reference
+#define H264E_FRAME_TYPE_DROPPABLE 1 // Use short-term reference, don't update anything
+#define H264E_FRAME_TYPE_CUSTOM 99 // Application specifies reference frame
+
+/**
+* Speed preset index.
+* Currently used values are 0, 1, 8 and 9
+*/
+#define H264E_SPEED_SLOWEST 0 // All coding tools enabled, including denoise filter
+#define H264E_SPEED_BALANCED 5
+#define H264E_SPEED_FASTEST 10 // Minimum tools enabled
+
+/**
+* Creation parameters
+*/
+typedef struct H264E_create_param_tag
+{
+ // Frame width: must be multiple of 16
+ int width;
+
+ // Frame height: must be multiple of 16
+ int height;
+
+ // GOP size == key frame period
+ // If 0: no key frames generated except 1st frame (infinite GOP)
+ // If 1: Only intra-frames produced
+ int gop;
+
+    // Video Buffer Verifier size, bytes
+ // If 0: VBV model would be disabled
+    // Note that this value defines the H.264 Level.
+ int vbv_size_bytes;
+
+ // If set: transparent frames produced on VBV overflow
+ // If not set: VBV overflow ignored, produce bitrate bigger than specified
+ int vbv_overflow_empty_frame_flag;
+
+ // If set: keep minimum bitrate using stuffing, prevent VBV underflow
+ // If not set: ignore VBV underflow, produce bitrate smaller than specified
+ int vbv_underflow_stuffing_flag;
+
+ // If set: control bitrate at macroblock-level (better bitrate precision)
+ // If not set: control bitrate at frame-level (better quality)
+ int fine_rate_control_flag;
+
+ // If set: don't change input, but allocate additional frame buffer
+ // If not set: use input as a scratch
+ int const_input_flag;
+
+ // If 0: golden, recovery, and custom frames are disabled
+    // If >0: specifies the number of persistent frame buffers used
+ int max_long_term_reference_frames;
+
+ int enableNEON;
+
+ // If set: enable temporal noise suppression
+ int temporal_denoise_flag;
+
+ int sps_id;
+
+#if H264E_SVC_API
+ // SVC extension
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // Number of SVC layers:
+ // 1 = AVC
+ // 2 = SVC with 2-layers of spatial scalability
+ int num_layers;
+
+ // If set, SVC extension layer will use predictors from base layer
+ // (sometimes can slightly increase efficiency)
+ int inter_layer_pred_flag;
+#endif
+
+#if H264E_MAX_THREADS
+ // Multi-thread extension
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // Maximum threads, supported by the callback
+ int max_threads;
+
+ // Opaque token, passed to callback
+ void *token;
+
+ // Application-supplied callback function.
+ // This callback runs given jobs, by calling provided job_func(), passing
+ // job_data[i] to each one.
+ //
+ // The h264e_thread_pool_run() can be used here, example:
+ //
+ // int max_threads = 4;
+ // void *thread_pool = h264e_thread_pool_init(max_threads);
+ //
+ // H264E_create_param_t par;
+ // par.max_threads = max_threads;
+ // par.token = thread_pool;
+ // par.run_func_in_thread = h264e_thread_pool_run;
+ //
+ // The reason to use double callbacks is to avoid mixing portable and
+ // system-dependent code, and to avoid close() function in the encoder API.
+ //
+ void (*run_func_in_thread)(void *token, void (*job_func)(void*), void *job_data[], int njobs);
+#endif
+
+} H264E_create_param_t;
+
+/**
+* Run-time parameters
+*/
+typedef struct H264E_run_param_tag
+{
+ // Variable, indicating speed/quality tradeoff
+ // 0 means best quality
+ int encode_speed;
+
+ // Frame type override: one of H264E_FRAME_TYPE_* values
+ // if 0: GOP pattern defined by create_param::gop value
+ int frame_type;
+
+ // Used only if frame_type == H264E_FRAME_TYPE_CUSTOM
+ // Reference long-term frame index [1..max_long_term_reference_frames]
+ // 0 = use previous frame (short-term)
+ // -1 = IDR frame, kill all long-term frames
+ int long_term_idx_use;
+
+ // Used only if frame_type == H264E_FRAME_TYPE_CUSTOM
+ // Store decoded frame in long-term buffer with given index in the
+ // range [1..max_long_term_reference_frames]
+ // 0 = save to short-term buffer
+    // -1 = Don't save frame (droppable)
+ int long_term_idx_update;
+
+ // Target frame size. Typically = bitrate/framerate
+ int desired_frame_bytes;
+
+ // Minimum quantizer value, 10 indicates good quality
+ // range: [10; qp_max]
+ int qp_min;
+
+ // Maximum quantizer value, 51 indicates very bad quality
+ // range: [qp_min; 51]
+ int qp_max;
+
+    // Desired NALU size. A NALU is produced as soon as its size exceeds this value
+ // if 0: frame would be coded with a single NALU
+ int desired_nalu_bytes;
+
+ // Optional NALU notification callback, called by the encoder
+    // as soon as NALU encoding is complete.
+ void (*nalu_callback)(
+ const unsigned char *nalu_data, // Coded NALU data, w/o start code
+ int sizeof_nalu_data, // Size of NALU data
+ void *token // optional transparent token
+ );
+
+ // token to pass to NALU callback
+ void *nalu_callback_token;
+
+} H264E_run_param_t;
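+// Example: a simple rate-controlled setup (mirrors what hj264.c does; for
+// fixed-QP encoding set qp_min == qp_max instead):
+//   rp.qp_min = 10;
+//   rp.qp_max = 50;
+//   rp.desired_frame_bytes = bitrate_kbps*1000/8/fps;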
+
+/**
+* Planar YUV420 descriptor
+*/
+typedef struct H264E_io_yuv_tag
+{
+ // Pointers to 3 pixel planes of YUV image
+ unsigned char *yuv[3];
+ // Stride for each image plane
+ int stride[3];
+} H264E_io_yuv_t;
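+// Example: planes carved from one contiguous buffer for a W x H frame
+// (W and H multiples of 16), as hj264.c does:
+//   yuv[0] = buf;           stride[0] = W;    // Y
+//   yuv[1] = buf + W*H;     stride[1] = W/2;  // U (Cb)
+//   yuv[2] = buf + W*H*5/4; stride[2] = W/2;  // V (Cr)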
+
+typedef struct H264E_persist_tag H264E_persist_t;
+typedef struct H264E_scratch_tag H264E_scratch_t;
+
+/**
+* Return persistent and scratch memory requirements
+* for given encoding options.
+*
+* Return value:
+* -zero in case of success
+* -error code (H264E_STATUS_*) on failure
+*
+* example:
+*
+* int sizeof_persist, sizeof_scratch, error;
+* H264E_persist_t * enc;
+* H264E_scratch_t * scratch;
+*
+* error = H264E_sizeof(param, &sizeof_persist, &sizeof_scratch);
+* if (!error)
+* {
+* enc = malloc(sizeof_persist);
+* scratch = malloc(sizeof_scratch);
+* error = H264E_init(enc, param);
+* }
+*/
+int H264E_sizeof(
+ const H264E_create_param_t *param, ///< Encoder creation parameters
+ int *sizeof_persist, ///< [OUT] Size of persistent RAM
+ int *sizeof_scratch ///< [OUT] Size of scratch RAM
+);
+
+/**
+* Initialize encoding session
+*
+* Return value:
+* -zero in case of success
+* -error code (H264E_STATUS_*) on failure
+*/
+int H264E_init(
+ H264E_persist_t *enc, ///< Encoder object
+ const H264E_create_param_t *param ///< Encoder creation parameters
+);
+
+/**
+* Encode single video frame
+*
+* Output buffer is in the scratch RAM
+*
+* Return value:
+* -zero in case of success
+* -error code (H264E_STATUS_*) on failure
+*/
+int H264E_encode(
+ H264E_persist_t *enc, ///< Encoder object
+ H264E_scratch_t *scratch, ///< Scratch memory
+ const H264E_run_param_t *run_param, ///< run-time parameters
+ H264E_io_yuv_t *frame, ///< Input video frame
+ unsigned char **coded_data, ///< [OUT] Pointer to coded data
+ int *sizeof_coded_data ///< [OUT] Size of coded data
+);
+
+/**
+* This is a "hack" function to set the internal rate-control state.
+* Note that the encoder allows the application to completely override rate control,
+* so this function should be used only by lazy coders who just want to change the
+* VBV size without implementing custom rate control.
+*
+* Note that the H.264 level is defined by the VBV size at initialization.
+*/
+void H264E_set_vbv_state(
+ H264E_persist_t *enc, ///< Encoder object
+ int vbv_size_bytes, ///< New VBV size
+    int vbv_fullness_bytes      ///< New VBV fullness, -1 = no change
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //MINIH264_H
+
+#if defined(MINIH264_IMPLEMENTATION) && !defined(MINIH264_IMPLEMENTATION_GUARD)
+#define MINIH264_IMPLEMENTATION_GUARD
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+/************************************************************************/
+/* Build configuration */
+/************************************************************************/
+#ifndef H264E_ENABLE_DENOISE
+#define H264E_ENABLE_DENOISE 1 // Built-in noise suppressor
+#endif
+
+#ifndef MAX_LONG_TERM_FRAMES
+#define MAX_LONG_TERM_FRAMES 8 // Max long-term frames count
+#endif
+
+#if !defined(MINIH264_ONLY_SIMD) && (defined(_M_X64) || defined(_M_ARM64) || defined(__x86_64__) || defined(__aarch64__))
+/* x64 always have SSE2, arm64 always have neon, no need for generic code */
+#define MINIH264_ONLY_SIMD
+#endif /* SIMD checks... */
+
+#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__))
+#define H264E_ENABLE_SSE2 1
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <emmintrin.h>
+#endif
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+#define H264E_ENABLE_NEON 1
+#include <arm_neon.h>
+#else
+#ifdef MINIH264_ONLY_SIMD
+#error MINIH264_ONLY_SIMD used, but SSE/NEON not enabled
+#endif
+#endif
+
+#ifndef MINIH264_ONLY_SIMD
+#define H264E_ENABLE_PLAIN_C 1
+#endif
+
+#define H264E_CONFIGS_COUNT ((H264E_ENABLE_SSE2) + (H264E_ENABLE_PLAIN_C) + (H264E_ENABLE_NEON))
+
+#if defined(__ARMCC_VERSION) || defined(_WIN32) || defined(__EMSCRIPTEN__)
+#define __BYTE_ORDER 0
+#define __BIG_ENDIAN 1
+#elif defined(__linux__) || defined(__CYGWIN__)
+#include <endian.h>
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#define __BYTE_ORDER BYTE_ORDER
+#define __BIG_ENDIAN BIG_ENDIAN
+#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__)
+#include <sys/endian.h>
+#elif __plan9__
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#else
+#error platform not supported
+#endif
+
+#if defined(__aarch64__) && defined(__clang__)
+// uintptr_t broken with aarch64 clang on ubuntu 18
+#define uintptr_t unsigned long
+#endif
+#if defined(__arm__) && defined(__clang__)
+#include <arm_acle.h>
+#elif defined(__arm__) && defined(__GNUC__) && !defined(__ARMCC_VERSION)
+static inline unsigned int __usad8(unsigned int val1, unsigned int val2)
+{
+ unsigned int result;
+ __asm__ volatile ("usad8 %0, %1, %2\n\t"
+ : "=r" (result)
+ : "r" (val1), "r" (val2));
+ return result;
+}
+
+static inline unsigned int __usada8(unsigned int val1, unsigned int val2, unsigned int val3)
+{
+ unsigned int result;
+ __asm__ volatile ("usada8 %0, %1, %2, %3\n\t"
+ : "=r" (result)
+ : "r" (val1), "r" (val2), "r" (val3));
+ return result;
+}
+
+static inline unsigned int __sadd16(unsigned int val1, unsigned int val2)
+{
+ unsigned int result;
+ __asm__ volatile ("sadd16 %0, %1, %2\n\t"
+ : "=r" (result)
+ : "r" (val1), "r" (val2));
+ return result;
+}
+
+static inline unsigned int __ssub16(unsigned int val1, unsigned int val2)
+{
+ unsigned int result;
+ __asm__ volatile ("ssub16 %0, %1, %2\n\t"
+ : "=r" (result)
+ : "r" (val1), "r" (val2));
+ return result;
+}
+
+static inline unsigned int __clz(unsigned int val1)
+{
+ unsigned int result;
+ __asm__ volatile ("clz %0, %1\n\t"
+ : "=r" (result)
+ : "r" (val1));
+ return result;
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif //__cplusplus
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+# define h264e_restrict __restrict
+#elif defined(__arm__)
+# define h264e_restrict __restrict
+#else
+# define h264e_restrict
+#endif
+#if defined(_MSC_VER)
+# define ALIGN(n) __declspec(align(n))
+# define ALIGN2(n)
+#else
+# define ALIGN(n)
+# define ALIGN2(n) __attribute__((aligned(n)))
+#endif
+
+#if __GNUC__ || __clang__
+typedef int int_u __attribute__ ((__aligned__ (1)));
+#else
+typedef int int_u;
+#endif
+
+#ifndef MAX
+# define MAX(x, y) ((x) > (y) ? (x) : (y))
+#endif
+
+#ifndef MIN
+# define MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif
+
+#ifndef ABS
+# define ABS(x) ((x) >= 0 ? (x) : -(x))
+#endif
+
+#define IS_ALIGNED(p, n) (!((uintptr_t)(p) & (uintptr_t)((n) - 1)))
+
+// bit-stream
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define SWAP32(x) (uint32_t)(x)
+#else
+#ifdef _MSC_VER
+# define SWAP32(x) _byteswap_ulong(x)
+#elif defined(__GNUC__) || defined(__clang__)
+# define SWAP32(x) __builtin_bswap32(x)
+#else
+# define SWAP32(x) (uint32_t)((((x) >> 24) & 0xFF) | (((x) >> 8) & 0xFF00) | (((x) << 8) & 0xFF0000) | ((x & 0xFF) << 24))
+#endif
+#endif
+
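+// Bit-writing helpers: bits accumulate MSB-first in a 32-bit cache, which is
+// flushed with SWAP32 so the emitted bytes are big-endian regardless of host order.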
+#define BS_OPEN(bs) uint32_t cache = bs->cache; int shift = bs->shift; uint32_t *buf = bs->buf;
+#define BS_CLOSE(bs) bs->cache = cache; bs->shift = shift; bs->buf = buf;
+#define BS_PUT(n, val) \
+if ((shift -= n) < 0) \
+{ \
+ cache |= val >> -shift; \
+ *buf++ = SWAP32(cache); \
+ shift += 32; \
+ cache = 0; \
+} \
+cache |= (uint32_t)val << shift;
+
+// Quantizer-dequantizer modes
+#define QDQ_MODE_INTRA_4 2 // intra 4x4
+#define QDQ_MODE_INTER 8 // inter
+#define QDQ_MODE_INTRA_16 (8 + 1) // intra 16x16
+#define QDQ_MODE_CHROMA (4 + 1) // chroma
+
+// put most frequently used bits to lsb, to use these as look-up tables
+#define AVAIL_TR 8
+#define AVAIL_TL 4
+#define AVAIL_L 2
+#define AVAIL_T 1
+
+typedef uint8_t pix_t;
+typedef uint32_t bs_item_t;
+
+/**
+* Output bitstream
+*/
+typedef struct
+{
+ int shift; // bit position in the cache
+ uint32_t cache; // bit cache
+ bs_item_t *buf; // current position
+ bs_item_t *origin; // initial position
+} bs_t;
+
+/**
+* Tuple for motion vector, or height/width representation
+*/
+typedef union
+{
+ struct
+ {
+ int16_t x; // horizontal or width
+ int16_t y; // vertical or height
+ } s;
+ int32_t u32; // packed representation
+} point_t;
+
+/**
+* Rectangle
+*/
+typedef struct
+{
+ point_t tl; // top-left corner
+ point_t br; // bottom-right corner
+} rectangle_t;
+
+/**
+* Quantized/dequantized representation for 4x4 block
+*/
+typedef struct
+{
+ int16_t qv[16]; // quantized coefficient
+ int16_t dq[16]; // dequantized
+} quant_t;
+
+/**
+* Scratch RAM, used only for current MB encoding
+*/
+typedef struct H264E_scratch_tag
+{
+ pix_t mb_pix_inp[256]; // Input MB (cached)
+ pix_t mb_pix_store[4*256]; // Prediction variants
+
+ // Quantized/dequantized
+ int16_t dcy[16]; // Y DC
+ quant_t qy[16]; // Y 16x4x4 blocks
+
+ int16_t dcu[16]; // U DC: 4 used + align
+ quant_t qu[4]; // U 4x4x4 blocks
+
+ int16_t dcv[16]; // V DC: 4 used + align
+ quant_t qv[4]; // V 4x4x4 blocks
+
+ // Quantized DC:
+ int16_t quant_dc[16]; // Y
+ int16_t quant_dc_u[4]; // U
+ int16_t quant_dc_v[4]; // V
+
+ uint16_t nz_mask; // Bit flags for non-zero 4x4 blocks
+} scratch_t;
+
+/**
+* Deblock filter frame context
+*/
+typedef struct
+{
+ // Motion vectors for 4x4 MB internal sub-blocks, top and left border,
+ // 5x5 array without top-left cell:
+ // T0 T1 T2 T4
+ // L0 i0 i1 i2 i3
+ // L1 ...
+ // ......
+ //
+ point_t df_mv[5*5 - 1]; // MV for current macroblock and neighbors
+ uint8_t *df_qp; // QP for current row of macroblocks
+ int8_t *mb_type; // Macroblock type for current row of macroblocks
+ uint32_t nzflag; // Bit flags for non-zero 4x4 blocks (left neighbors)
+
+ // Huffman and deblock uses different nnz...
+ uint8_t *df_nzflag; // Bit flags for non-zero 4x4 blocks (top neighbors), only 4 bits used
+} deblock_filter_t;
+
+/**
+* Deblock filter parameters for current MB
+*/
+typedef struct
+{
+    uint32_t strength32[4*2];   // Strength for 4 columns and 4 rows
+    uint8_t tc0[16*2];          // TC0 parameter for 4 columns and 4 rows
+ uint8_t alpha[2*2]; // alpha for border/internals
+ uint8_t beta[2*2]; // beta for border/internals
+} deblock_params_t;
+
+/**
+* Persistent RAM
+*/
+typedef struct H264E_persist_tag
+{
+ H264E_create_param_t param; // Copy of create parameters
+ H264E_io_yuv_t inp; // Input picture
+
+ struct
+ {
+ int pic_init_qp; // Initial QP
+ } sps;
+
+ struct
+ {
+ int num; // Frame number
+ int nmbx; // Frame width, macroblocks
+ int nmby; // Frame height, macroblocks
+ int nmb; // Number of macroblocks in frame
+ int w; // Frame width, pixels
+ int h; // Frame height, pixels
+ rectangle_t mv_limit; // Frame MV limits = frame + border extension
+ rectangle_t mv_qpel_limit; // Reduced MV limits for qpel interpolation filter
+ int cropping_flag; // Cropping indicator
+ } frame;
+
+ struct
+ {
+ int type; // Current slice type (I/P)
+ int start_mb_num; // # of 1st MB in the current slice
+ } slice;
+
+ struct
+ {
+ int x; // MB x position (in MB's)
+ int y; // MB y position (in MB's)
+ int num; // MB number
+ int skip_run; // Skip run count
+
+ // according to table 7-13
+ // -1 = skip, 0 = P16x16, 1 = P16x8, 2=P8x16, 3 = P8x8, 5 = I4x4, >=6 = I16x16
+ int type; // MB type
+
+ struct
+ {
+ int pred_mode_luma; // Intra 16x16 prediction mode
+ } i16;
+
+ int8_t i4x4_mode[16]; // Intra 4x4 prediction modes
+
+ int cost; // Best coding cost
+ int avail; // Neighbor availability flags
+ point_t mvd[16]; // Delta-MV for each 4x4 sub-part
+ point_t mv[16]; // MV for each 4x4 sub-part
+
+ point_t mv_skip_pred; // Skip MV predictor
+ } mb;
+
+ H264E_io_yuv_t ref; // Current reference picture
+ H264E_io_yuv_t dec; // Reconstructed current macroblock
+#if H264E_ENABLE_DENOISE
+ H264E_io_yuv_t denoise; // Noise suppression filter
+#endif
+
+ unsigned char *lt_yuv[MAX_LONG_TERM_FRAMES][3]; // Long-term reference pictures
+ unsigned char lt_used[MAX_LONG_TERM_FRAMES]; // Long-term "used" flags
+
+ struct
+ {
+ int qp; // Current QP
+ int vbv_bits; // Current VBV fullness, bits
+ int qp_smooth; // Averaged QP
+ int dqp_smooth; // Adaptive QP adjustment, account for "compressibility"
+ int max_dqp; // Worst-case DQP, for long-term reference QP adjustment
+
+ int bit_budget; // Frame bit budget
+ int prev_qp; // Previous MB QP
+ int prev_err; // Accumulated coded size error
+ int stable_count; // Stable/not stable state machine
+
+ int vbv_target_level; // Desired VBV fullness after frame encode
+
+ // Quantizer data, passed to low-level functions
+ // layout:
+ // multiplier_quant0, multiplier_dequant0,
+ // multiplier_quant2, multiplier_dequant2,
+ // multiplier_quant1, multiplier_dequant1,
+ // rounding_factor_pos,
+ // zero_thr_inter
+ // zero_thr_inter2
+ // ... and same data for chroma
+ //uint16_t qdat[2][(6 + 4)];
+#define OFFS_RND_INTER 6
+#define OFFS_RND_INTRA 7
+#define OFFS_THR_INTER 8
+#define OFFS_THR2_INTER 9
+#define OFFS_THR_1_OFF 10
+#define OFFS_THR_2_OFF 18
+#define OFFS_QUANT_VECT 26
+#define OFFS_DEQUANT_VECT 34
+ //struct
+ //{
+ // uint16_t qdq[6];
+ // uint16_t rnd[2]; // inter/intra
+ // uint16_t thr[2]; // thresholds
+ // uint16_t zero_thr[2][8];
+ // uint16_t qfull[8];
+ // uint16_t dqfull[8];
+ //} qdat[2];
+ uint16_t qdat[2][6 + 2 + 2 + 8 + 8 + 8 + 8];
+ } rc;
+
+ deblock_filter_t df; // Deblock filter
+
+ // Speed/quality trade-off
+ struct
+ {
+ int disable_deblock; // Disable deblock filter flags
+ } speed;
+
+ int most_recent_ref_frame_idx; // Last updated long-term reference
+
+ // predictors contexts
+ point_t *mv_pred; // MV for left&top 4x4 blocks
+ uint8_t *nnz; // Number of non-zero coeffs per 4x4 block for left&top
+ int32_t *i4x4mode; // Intra 4x4 mode for left&top
+ pix_t *top_line; // left&top neighbor pixels
+
+ // output data
+ uint8_t *out; // Output data storage (pointer to scratch RAM!)
+ unsigned int out_pos; // Output byte position
+ bs_t bs[1]; // Output bitbuffer
+
+ scratch_t *scratch; // Pointer to scratch RAM
+#if H264E_MAX_THREADS > 1
+ scratch_t *scratch_store[H264E_MAX_THREADS]; // Pointer to scratch RAM
+ int sizeof_scaratch;
+#endif
+ H264E_run_param_t run_param; // Copy of run-time parameters
+
+ // Consecutive IDR's must have different idr_pic_id,
+ // unless there are some P between them
+ uint8_t next_idr_pic_id;
+
+ pix_t *pbest; // Macroblock best predictor
+ pix_t *ptest; // Macroblock predictor under test
+
+ point_t mv_clusters[2]; // MV clusterization for prediction
+
+ // Flag to track short-term reference buffer, for MMCO 1 command
+ int short_term_used;
+
+#if H264E_SVC_API
+ //svc ext
+ int current_layer;
+ int adaptive_base_mode_flag;
+ void *enc_next;
+#endif
+
+} h264e_enc_t;
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+/************************************************************************/
+/* Constants */
+/************************************************************************/
+
+// Tunable constants can be adjusted by the "training" application
+#ifndef ADJUSTABLE
+# define ADJUSTABLE static const
+#endif
+
+// Huffman encode tables
+#define CODE8(val, len) (uint8_t)((val << 4) + len)
+#define CODE(val, len) (uint8_t)((val << 4) + (len - 1))
+
+const uint8_t h264e_g_run_before[57] =
+{
+ 15, 17, 20, 24, 29, 35, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ /**** Table # 0 size 2 ****/
+ CODE8(1, 1), CODE8(0, 1),
+ /**** Table # 1 size 3 ****/
+ CODE8(1, 1), CODE8(1, 2), CODE8(0, 2),
+ /**** Table # 2 size 4 ****/
+ CODE8(3, 2), CODE8(2, 2), CODE8(1, 2), CODE8(0, 2),
+ /**** Table # 3 size 5 ****/
+ CODE8(3, 2), CODE8(2, 2), CODE8(1, 2), CODE8(1, 3), CODE8(0, 3),
+ /**** Table # 4 size 6 ****/
+ CODE8(3, 2), CODE8(2, 2), CODE8(3, 3), CODE8(2, 3), CODE8(1, 3), CODE8(0, 3),
+ /**** Table # 5 size 7 ****/
+ CODE8(3, 2), CODE8(0, 3), CODE8(1, 3), CODE8(3, 3), CODE8(2, 3), CODE8(5, 3), CODE8(4, 3),
+ /**** Table # 6 size 15 ****/
+ CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(2, 3), CODE8(1, 3), CODE8(1, 4),
+ CODE8(1, 5), CODE8(1, 6), CODE8(1, 7), CODE8(1, 8), CODE8(1, 9), CODE8(1, 10), CODE8(1, 11),
+};
+
+const uint8_t h264e_g_total_zeros_cr_2x2[12] =
+{
+ 3, 7, 10,
+ /**** Table # 0 size 4 ****/
+ CODE8(1, 1), CODE8(1, 2), CODE8(1, 3), CODE8(0, 3),
+ /**** Table # 1 size 3 ****/
+ CODE8(1, 1), CODE8(1, 2), CODE8(0, 2),
+ /**** Table # 2 size 2 ****/
+ CODE8(1, 1), CODE8(0, 1),
+};
+
+const uint8_t h264e_g_total_zeros[150] =
+{
+ 15, 31, 46, 60, 73, 85, 96, 106, 115, 123, 130, 136, 141, 145, 148,
+ /**** Table # 0 size 16 ****/
+ CODE8(1, 1), CODE8(3, 3), CODE8(2, 3), CODE8(3, 4), CODE8(2, 4), CODE8(3, 5), CODE8(2, 5), CODE8(3, 6),
+ CODE8(2, 6), CODE8(3, 7), CODE8(2, 7), CODE8(3, 8), CODE8(2, 8), CODE8(3, 9), CODE8(2, 9), CODE8(1, 9),
+ /**** Table # 1 size 15 ****/
+ CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(5, 4), CODE8(4, 4), CODE8(3, 4),
+ CODE8(2, 4), CODE8(3, 5), CODE8(2, 5), CODE8(3, 6), CODE8(2, 6), CODE8(1, 6), CODE8(0, 6),
+ /**** Table # 2 size 14 ****/
+ CODE8(5, 4), CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 4), CODE8(3, 4), CODE8(4, 3), CODE8(3, 3),
+ CODE8(2, 4), CODE8(3, 5), CODE8(2, 5), CODE8(1, 6), CODE8(1, 5), CODE8(0, 6),
+ /**** Table # 3 size 13 ****/
+ CODE8(3, 5), CODE8(7, 3), CODE8(5, 4), CODE8(4, 4), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 4),
+ CODE8(3, 3), CODE8(2, 4), CODE8(2, 5), CODE8(1, 5), CODE8(0, 5),
+ /**** Table # 4 size 12 ****/
+ CODE8(5, 4), CODE8(4, 4), CODE8(3, 4), CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3),
+ CODE8(2, 4), CODE8(1, 5), CODE8(1, 4), CODE8(0, 5),
+ /**** Table # 5 size 11 ****/
+ CODE8(1, 6), CODE8(1, 5), CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(2, 3),
+ CODE8(1, 4), CODE8(1, 3), CODE8(0, 6),
+ /**** Table # 6 size 10 ****/
+ CODE8(1, 6), CODE8(1, 5), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(3, 2), CODE8(2, 3), CODE8(1, 4),
+ CODE8(1, 3), CODE8(0, 6),
+ /**** Table # 7 size 9 ****/
+ CODE8(1, 6), CODE8(1, 4), CODE8(1, 5), CODE8(3, 3), CODE8(3, 2), CODE8(2, 2), CODE8(2, 3), CODE8(1, 3),
+ CODE8(0, 6),
+ /**** Table # 8 size 8 ****/
+ CODE8(1, 6), CODE8(0, 6), CODE8(1, 4), CODE8(3, 2), CODE8(2, 2), CODE8(1, 3), CODE8(1, 2), CODE8(1, 5),
+ /**** Table # 9 size 7 ****/
+ CODE8(1, 5), CODE8(0, 5), CODE8(1, 3), CODE8(3, 2), CODE8(2, 2), CODE8(1, 2), CODE8(1, 4),
+ /**** Table # 10 size 6 ****/
+ CODE8(0, 4), CODE8(1, 4), CODE8(1, 3), CODE8(2, 3), CODE8(1, 1), CODE8(3, 3),
+ /**** Table # 11 size 5 ****/
+ CODE8(0, 4), CODE8(1, 4), CODE8(1, 2), CODE8(1, 1), CODE8(1, 3),
+ /**** Table # 12 size 4 ****/
+ CODE8(0, 3), CODE8(1, 3), CODE8(1, 1), CODE8(1, 2),
+ /**** Table # 13 size 3 ****/
+ CODE8(0, 2), CODE8(1, 2), CODE8(1, 1),
+ /**** Table # 14 size 2 ****/
+ CODE8(0, 1), CODE8(1, 1),
+};
+
+const uint8_t h264e_g_coeff_token[277 + 18] =
+{
+ 17 + 18, 17 + 18,
+ 82 + 18, 82 + 18,
+ 147 + 18, 147 + 18, 147 + 18, 147 + 18,
+ 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18,
+ 0 + 18,
+ /**** Table # 4 size 17 ****/ // offs: 0
+ CODE(1, 2), CODE(1, 1), CODE(1, 3), CODE(5, 6), CODE(7, 6), CODE(6, 6), CODE(2, 7), CODE(0, 7), CODE(4, 6),
+ CODE(3, 7), CODE(2, 8), CODE(0, 0), CODE(3, 6), CODE(3, 8), CODE(0, 0), CODE(0, 0), CODE(2, 6),
+ /**** Table # 0 size 65 ****/ // offs: 17
+ CODE( 1, 1), CODE( 1, 2), CODE( 1, 3), CODE( 3, 5), CODE( 5, 6), CODE( 4, 6), CODE( 5, 7), CODE( 3, 6),
+ CODE( 7, 8), CODE( 6, 8), CODE( 5, 8), CODE( 4, 7), CODE( 7, 9), CODE( 6, 9), CODE( 5, 9), CODE( 4, 8),
+ CODE( 7, 10), CODE( 6, 10), CODE( 5, 10), CODE( 4, 9), CODE( 7, 11), CODE( 6, 11), CODE( 5, 11), CODE( 4, 10),
+ CODE(15, 13), CODE(14, 13), CODE(13, 13), CODE( 4, 11), CODE(11, 13), CODE(10, 13), CODE( 9, 13), CODE(12, 13),
+ CODE( 8, 13), CODE(14, 14), CODE(13, 14), CODE(12, 14), CODE(15, 14), CODE(10, 14), CODE( 9, 14), CODE( 8, 14),
+ CODE(11, 14), CODE(14, 15), CODE(13, 15), CODE(12, 15), CODE(15, 15), CODE(10, 15), CODE( 9, 15), CODE( 8, 15),
+ CODE(11, 15), CODE( 1, 15), CODE(13, 16), CODE(12, 16), CODE(15, 16), CODE(14, 16), CODE( 9, 16), CODE( 8, 16),
+ CODE(11, 16), CODE(10, 16), CODE( 5, 16), CODE( 0, 0), CODE( 7, 16), CODE( 6, 16), CODE( 0, 0), CODE( 0, 0), CODE( 4, 16),
+ /**** Table # 1 size 65 ****/ // offs: 82
+ CODE( 3, 2), CODE( 2, 2), CODE( 3, 3), CODE( 5, 4), CODE(11, 6), CODE( 7, 5), CODE( 9, 6), CODE( 4, 4),
+ CODE( 7, 6), CODE(10, 6), CODE( 5, 6), CODE( 6, 5), CODE( 7, 7), CODE( 6, 6), CODE( 5, 7), CODE( 8, 6),
+ CODE( 7, 8), CODE( 6, 7), CODE( 5, 8), CODE( 4, 6), CODE( 4, 8), CODE( 6, 8), CODE( 5, 9), CODE( 4, 7),
+ CODE( 7, 9), CODE( 6, 9), CODE(13, 11), CODE( 4, 9), CODE(15, 11), CODE(14, 11), CODE( 9, 11), CODE(12, 11),
+ CODE(11, 11), CODE(10, 11), CODE(13, 12), CODE( 8, 11), CODE(15, 12), CODE(14, 12), CODE( 9, 12), CODE(12, 12),
+ CODE(11, 12), CODE(10, 12), CODE(13, 13), CODE(12, 13), CODE( 8, 12), CODE(14, 13), CODE( 9, 13), CODE( 8, 13),
+ CODE(15, 13), CODE(10, 13), CODE( 6, 13), CODE( 1, 13), CODE(11, 13), CODE(11, 14), CODE(10, 14), CODE( 4, 14),
+ CODE( 7, 13), CODE( 8, 14), CODE( 5, 14), CODE( 0, 0), CODE( 9, 14), CODE( 6, 14), CODE( 0, 0), CODE( 0, 0), CODE( 7, 14),
+ /**** Table # 2 size 65 ****/ // offs: 147
+ CODE(15, 4), CODE(14, 4), CODE(13, 4), CODE(12, 4), CODE(15, 6), CODE(15, 5), CODE(14, 5), CODE(11, 4),
+ CODE(11, 6), CODE(12, 5), CODE(11, 5), CODE(10, 4), CODE( 8, 6), CODE(10, 5), CODE( 9, 5), CODE( 9, 4),
+ CODE(15, 7), CODE( 8, 5), CODE(13, 6), CODE( 8, 4), CODE(11, 7), CODE(14, 6), CODE( 9, 6), CODE(13, 5),
+ CODE( 9, 7), CODE(10, 6), CODE(13, 7), CODE(12, 6), CODE( 8, 7), CODE(14, 7), CODE(10, 7), CODE(12, 7),
+ CODE(15, 8), CODE(14, 8), CODE(13, 8), CODE(12, 8), CODE(11, 8), CODE(10, 8), CODE( 9, 8), CODE( 8, 8),
+ CODE(15, 9), CODE(14, 9), CODE(13, 9), CODE(12, 9), CODE(11, 9), CODE(10, 9), CODE( 9, 9), CODE(10, 10),
+ CODE( 8, 9), CODE( 7, 9), CODE(11, 10), CODE( 6, 10), CODE(13, 10), CODE(12, 10), CODE( 7, 10), CODE( 2, 10),
+ CODE( 9, 10), CODE( 8, 10), CODE( 3, 10), CODE( 0, 0), CODE( 5, 10), CODE( 4, 10), CODE( 0, 0), CODE( 0, 0), CODE( 1, 10),
+ /**** Table # 3 size 65 ****/ // offs: 212
+ 3, 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 19, 8, 13, 18, 23, 12, 17, 22, 27, 16, 21, 26, 31, 20, 25, 30, 35,
+ 24, 29, 34, 39, 28, 33, 38, 43, 32, 37, 42, 47, 36, 41, 46, 51, 40, 45, 50, 55, 44, 49, 54, 59, 48, 53, 58, 63,
+ 52, 57, 62, 0, 56, 61, 0, 0, 60
+};
+
+/*
+ Block scan order
+ 0 1 4 5
+ 2 3 6 7
+ 8 9 C D
+ A B E F
+*/
+static const uint8_t decode_block_scan[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };
+
+static const uint8_t qpy2qpc[52] = { // todo: [0 - 9] not used
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 29, 30, 31, 32, 32, 33, 34, 34, 35,
+ 35, 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, 39, 39,
+};
+
+/**
+* Rate-control LUT for intra/inter macroblocks: number of bits per macroblock for given QP
+* Estimated experimentally
+*/
+static const uint16_t bits_per_mb[2][42 - 1] =
+{
+ // 10 20 30 40 50
+ { 664, 597, 530, 484, 432, 384, 341, 297, 262, 235, 198, 173, 153, 131, 114, 102, 84, 74, 64, 54, 47, 42, 35, 31, 26, 22, 20, 17, 15, 13, 12, 10, 9, 9, 7, 7, 6, 5, 4, 1, 1}, // P
+ {1057, 975, 925, 868, 803, 740, 694, 630, 586, 547, 496, 457, 420, 378, 345, 318, 284, 258, 234, 210, 190, 178, 155, 141, 129, 115, 102, 95, 82, 75, 69, 60, 55, 51, 45, 41, 40, 35, 31, 28, 24} // I
+};
+
+/**
+* Deblock filter constants:
+* <alpha> <thr[1]> <thr[2]> <thr[3]> <beta>
+*/
+static const uint8_t g_a_tc0_b[52 - 10][5] = {
+ { 0, 0, 0, 0, 0}, // 10
+ { 0, 0, 0, 0, 0}, // 11
+ { 0, 0, 0, 0, 0}, // 12
+ { 0, 0, 0, 0, 0}, // 13
+ { 0, 0, 0, 0, 0}, // 14
+ { 0, 0, 0, 0, 0}, // 15
+ { 4, 0, 0, 0, 2},
+ { 4, 0, 0, 1, 2},
+ { 5, 0, 0, 1, 2},
+ { 6, 0, 0, 1, 3},
+ { 7, 0, 0, 1, 3},
+ { 8, 0, 1, 1, 3},
+ { 9, 0, 1, 1, 3},
+ { 10, 1, 1, 1, 4},
+ { 12, 1, 1, 1, 4},
+ { 13, 1, 1, 1, 4},
+ { 15, 1, 1, 1, 6},
+ { 17, 1, 1, 2, 6},
+ { 20, 1, 1, 2, 7},
+ { 22, 1, 1, 2, 7},
+ { 25, 1, 1, 2, 8},
+ { 28, 1, 2, 3, 8},
+ { 32, 1, 2, 3, 9},
+ { 36, 2, 2, 3, 9},
+ { 40, 2, 2, 4, 10},
+ { 45, 2, 3, 4, 10},
+ { 50, 2, 3, 4, 11},
+ { 56, 3, 3, 5, 11},
+ { 63, 3, 4, 6, 12},
+ { 71, 3, 4, 6, 12},
+ { 80, 4, 5, 7, 13},
+ { 90, 4, 5, 8, 13},
+ {101, 4, 6, 9, 14},
+ {113, 5, 7, 10, 14},
+ {127, 6, 8, 11, 15},
+ {144, 6, 8, 13, 15},
+ {162, 7, 10, 14, 16},
+ {182, 8, 11, 16, 16},
+ {203, 9, 12, 18, 17},
+ {226, 10, 13, 20, 17},
+ {255, 11, 15, 23, 18},
+ {255, 13, 17, 25, 18},
+};
+
+/************************************************************************/
+/* Adjustable encoder parameters. Initial MIN_QP values never used */
+/************************************************************************/
+
+ADJUSTABLE uint16_t g_rnd_inter[] = {
+ 11665, 11665, 11665, 11665, 11665, 11665, 11665, 11665, 11665, 11665,
+ 11665, 12868, 14071, 15273, 16476,
+ 17679, 17740, 17801, 17863, 17924,
+ 17985, 17445, 16904, 16364, 15823,
+ 15283, 15198, 15113, 15027, 14942,
+ 14857, 15667, 16478, 17288, 18099,
+ 18909, 19213, 19517, 19822, 20126,
+ 20430, 16344, 12259, 8173, 4088,
+ 4088, 4088, 4088, 4088, 4088,
+ 4088, 4088,
+};
+
+ADJUSTABLE uint16_t g_thr_inter[] = {
+ 31878, 31878, 31878, 31878, 31878, 31878, 31878, 31878, 31878, 31878,
+ 31878, 33578, 35278, 36978, 38678,
+ 40378, 41471, 42563, 43656, 44748,
+ 45841, 46432, 47024, 47615, 48207,
+ 48798, 49354, 49911, 50467, 51024,
+ 51580, 51580, 51580, 51580, 51580,
+ 51580, 52222, 52864, 53506, 54148,
+ 54790, 45955, 37120, 28286, 19451,
+ 10616, 9326, 8036, 6745, 5455,
+ 4165, 4165,
+};
+
+ADJUSTABLE uint16_t g_thr_inter2[] = {
+ 45352, 45352, 45352, 45352, 45352, 45352, 45352, 45352, 45352, 45352,
+ 45352, 41100, 36848, 32597, 28345,
+ 24093, 25904, 27715, 29525, 31336,
+ 33147, 33429, 33711, 33994, 34276,
+ 34558, 32902, 31246, 29590, 27934,
+ 26278, 26989, 27700, 28412, 29123,
+ 29834, 29038, 28242, 27445, 26649,
+ 25853, 23440, 21028, 18615, 16203,
+ 13790, 11137, 8484, 5832, 3179,
+ 526, 526,
+};
+
+ADJUSTABLE uint16_t g_skip_thr_inter[52] =
+{
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 44, 44,
+ 44, 40, 37, 33, 30,
+ 26, 32, 38, 45, 51,
+ 57, 58, 58, 59, 59,
+ 60, 66, 73, 79, 86,
+ 92, 95, 98, 100, 103,
+ 106, 200, 300, 400, 500,
+ 600, 700, 800, 900, 1000,
+ 1377, 1377,
+};
+
+ADJUSTABLE uint16_t g_lambda_q4[52] =
+{
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 13, 11, 10, 8,
+ 7, 11, 15, 20, 24,
+ 28, 30, 31, 33, 34,
+ 36, 48, 60, 71, 83,
+ 95, 95, 95, 96, 96,
+ 96, 113, 130, 147, 164,
+ 181, 401, 620, 840, 1059,
+ 1279, 1262, 1246, 1229, 1213,
+ 1196, 1196,
+};
+ADJUSTABLE uint16_t g_lambda_mv_q4[52] =
+{
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 14, 15, 15, 16,
+ 17, 18, 20, 21, 23,
+ 24, 28, 32, 37, 41,
+ 45, 53, 62, 70, 79,
+ 87, 105, 123, 140, 158,
+ 176, 195, 214, 234, 253,
+ 272, 406, 541, 675, 810,
+ 944, 895, 845, 796, 746,
+ 697, 697,
+};
+
+ADJUSTABLE uint16_t g_skip_thr_i4x4[52] =
+{
+ 0,1,2,3,4,5,6,7,8,9,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 100, 100,
+};
+
+ADJUSTABLE uint16_t g_deadzonei[] = {
+ 3419, 3419, 3419, 3419, 3419, 3419, 3419, 3419, 3419, 3419,
+ 30550, 8845, 14271, 19698, 25124,
+ 30550, 29556, 28562, 27569, 26575,
+ 25581, 25284, 24988, 24691, 24395,
+ 24098, 24116, 24134, 24153, 24171,
+ 24189, 24010, 23832, 23653, 23475,
+ 23296, 23569, 23842, 24115, 24388,
+ 24661, 19729, 14797, 9865, 4933,
+ 24661, 3499, 6997, 10495, 13993,
+ 17491, 17491,
+};
+
+ADJUSTABLE uint16_t g_lambda_i4_q4[] = {
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 31, 34, 38, 41,
+ 45, 76, 106, 137, 167,
+ 198, 220, 243, 265, 288,
+ 310, 347, 384, 421, 458,
+ 495, 584, 673, 763, 852,
+ 941, 1053, 1165, 1276, 1388,
+ 1500, 1205, 910, 614, 319,
+ 5000, 1448, 2872, 4296, 5720,
+ 7144, 7144,
+};
+
+ADJUSTABLE uint16_t g_lambda_i16_q4[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+ 0, 3, 7, 10, 14,
+ 17, 14, 10, 7, 3,
+ 50, 20, 39, 59, 78,
+ 98, 94, 89, 85, 80,
+ 76, 118, 161, 203, 246,
+ 288, 349, 410, 470, 531,
+ 592, 575, 558, 540, 523,
+ 506, 506,
+};
+
+const uint8_t g_diff_to_gainQ8[256] =
+{
+ 0, 16, 25, 32, 37, 41, 44, 48, 50, 53, 55, 57, 59, 60, 62, 64, 65,
+ 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, 80, 80,
+ 81, 82, 82, 83, 83, 84, 85, 85, 86, 86, 87, 87, 88, 88, 89, 89,
+ 90, 90, 91, 91, 92, 92, 92, 93, 93, 94, 94, 94, 95, 95, 96, 96,
+ 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 99, 100, 100, 100, 101, 101,
+ 101, 102, 102, 102, 102, 103, 103, 103, 103, 104, 104, 104, 104, 105, 105, 105,
+ 105, 106, 106, 106, 106, 106, 107, 107, 107, 107, 108, 108, 108, 108, 108, 109,
+ 109, 109, 109, 109, 110, 110, 110, 110, 110, 111, 111, 111, 111, 111, 112, 112,
+ 112, 112, 112, 112, 113, 113, 113, 113, 113, 113, 114, 114, 114, 114, 114, 114,
+ 115, 115, 115, 115, 115, 115, 115, 116, 116, 116, 116, 116, 116, 117, 117, 117,
+ 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, 119, 119, 119, 119,
+ 119, 119, 119, 119, 120, 120, 120, 120, 120, 120, 120, 120, 121, 121, 121, 121,
+ 121, 121, 121, 121, 122, 122, 122, 122, 122, 122, 122, 122, 122, 123, 123, 123,
+ 123, 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 124, 124, 124, 124, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 126, 126,
+ 126, 126, 126, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 128,
+};
+
+#if H264E_ENABLE_SSE2 && !defined(MINIH264_ASM)
+#define BS_BITS 32
+
+static void h264e_bs_put_bits_sse2(bs_t *bs, unsigned n, unsigned val)
+{
+ assert(!(val >> n));
+ bs->shift -= n;
+ assert((unsigned)n <= 32);
+ if (bs->shift < 0)
+ {
+ assert(-bs->shift < 32);
+ bs->cache |= val >> -bs->shift;
+ *bs->buf++ = SWAP32(bs->cache);
+ bs->shift = 32 + bs->shift;
+ bs->cache = 0;
+ }
+ bs->cache |= val << bs->shift;
+}
+
+static void h264e_bs_flush_sse2(bs_t *bs)
+{
+ *bs->buf = SWAP32(bs->cache);
+}
+
+static unsigned h264e_bs_get_pos_bits_sse2(const bs_t *bs)
+{
+ unsigned pos_bits = (unsigned)((bs->buf - bs->origin)*BS_BITS);
+ pos_bits += BS_BITS - bs->shift;
+ assert((int)pos_bits >= 0);
+ return pos_bits;
+}
+
+static unsigned h264e_bs_byte_align_sse2(bs_t *bs)
+{
+ int pos = h264e_bs_get_pos_bits_sse2(bs);
+ h264e_bs_put_bits_sse2(bs, -pos & 7, 0);
+ return pos + (-pos & 7);
+}
+
+/**
+* Golomb code
+* 0 => 1
+* 1 => 01 0
+* 2 => 01 1
+* 3 => 001 00
+* 4 => 001 01
+*
+* [0] => 1
+* [1..2] => 01x
+* [3..6] => 001xx
+* [7..14] => 0001xxx
+*
+*/
+static void h264e_bs_put_golomb_sse2(bs_t *bs, unsigned val)
+{
+ int size;
+#if defined(_MSC_VER)
+ unsigned long nbit;
+ _BitScanReverse(&nbit, val + 1);
+ size = 1 + nbit;
+#else
+ size = 32 - __builtin_clz(val + 1);
+#endif
+ h264e_bs_put_bits_sse2(bs, 2*size - 1, val + 1);
+}
+
+/**
+* signed Golomb code.
+* mapping to unsigned code:
+* 0 => 0
+* 1 => 1
+* -1 => 2
+* 2 => 3
+* -2 => 4
+* 3 => 5
+* -3 => 6
+*/
+static void h264e_bs_put_sgolomb_sse2(bs_t *bs, int val)
+{
+ val = 2*val - 1;
+ val ^= val >> 31;
+ h264e_bs_put_golomb_sse2(bs, val);
+}
+
+static void h264e_bs_init_bits_sse2(bs_t *bs, void *data)
+{
+ bs->origin = data;
+ bs->buf = bs->origin;
+ bs->shift = BS_BITS;
+ bs->cache = 0;
+}
+
+static unsigned __clz_cavlc(unsigned v)
+{
+#if defined(_MSC_VER)
+ unsigned long nbit;
+ _BitScanReverse(&nbit, v);
+ return 31 - nbit;
+#else
+ return __builtin_clz(v);
+#endif
+}
+
+static void h264e_vlc_encode_sse2(bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx)
+{
+ int nnz_context, nlevels, nnz; // nnz = nlevels + trailing_ones
+ unsigned trailing_ones = 0;
+ unsigned trailing_ones_sign = 0;
+ uint8_t runs[16];
+ uint8_t *prun = runs;
+ int16_t *levels;
+ int cloop = maxNumCoeff;
+ int v, drun;
+ unsigned zmask;
+ BS_OPEN(bs)
+
+ ALIGN(16) int16_t zzquant[16] ALIGN2(16);
+ levels = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+ if (maxNumCoeff != 4)
+ {
+ __m128i y0, y1;
+ __m128i x0 = _mm_load_si128((__m128i *)quant);
+ __m128i x1 = _mm_load_si128((__m128i *)(quant + 8));
+#define SWAP_XMM(x, i, j) { int t0 = _mm_extract_epi16(x, i); int t1 = _mm_extract_epi16(x, j); x = _mm_insert_epi16(x, t0, j); x = _mm_insert_epi16(x, t1, i); }
+#define SWAP_XMM2(x, y, i, j) { int t0 = _mm_extract_epi16(x, i); int t1 = _mm_extract_epi16(y, j); y = _mm_insert_epi16(y, t0, j); x = _mm_insert_epi16(x, t1, i); }
+ SWAP_XMM(x0, 3, 4);
+ SWAP_XMM(x1, 3, 4);
+ SWAP_XMM2(x0, x1, 5, 2);
+ x0 = _mm_shufflelo_epi16(x0, 0 + (3 << 2) + (1 << 4) + (2 << 6));
+ x0 = _mm_shufflehi_epi16(x0, 2 + (0 << 2) + (3 << 4) + (1 << 6));
+ x1 = _mm_shufflelo_epi16(x1, 2 + (0 << 2) + (3 << 4) + (1 << 6));
+ x1 = _mm_shufflehi_epi16(x1, 1 + (2 << 2) + (0 << 4) + (3 << 6));
+ y0 = _mm_unpacklo_epi64(x0, x1);
+ y1 = _mm_unpackhi_epi64(x0, x1);
+ y0 = _mm_slli_epi16(y0, 1);
+ y1 = _mm_slli_epi16(y1, 1);
+ zmask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_packs_epi16(y0, y1), _mm_setzero_si128()));
+ _mm_store_si128((__m128i *)zzquant, y0);
+ _mm_store_si128((__m128i *)(zzquant + 8), y1);
+
+ if (maxNumCoeff == 15)
+ zmask |= 1;
+ zmask = (~zmask) << 16;
+
+ v = 15;
+ drun = (maxNumCoeff == 16) ? 1 : 0;
+ } else
+ {
+ __m128i x0 = _mm_loadl_epi64((__m128i *)quant);
+ x0 = _mm_slli_epi16(x0, 1);
+ zmask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_packs_epi16(x0, x0), _mm_setzero_si128()));
+ _mm_storel_epi64((__m128i *)zzquant, x0);
+ zmask = (~zmask) << 28;
+ drun = 1;
+ v = 3;
+ }
+
+ if (zmask)
+ {
+ do
+ {
+ int i = __clz_cavlc(zmask);
+ *--levels = zzquant[v -= i];
+ *prun++ = (uint8_t)(v + drun);
+ zmask <<= (i + 1);
+ v--;
+ } while(zmask);
+ quant = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+ nnz = (int)(quant - levels);
+
+ cloop = MIN(3, nnz);
+ levels = quant - 1;
+ do
+ {
+ if ((unsigned)(*levels + 2) > 4u)
+ {
+ break;
+ }
+ trailing_ones_sign = (trailing_ones_sign << 1) | (*levels-- < 0);
+ trailing_ones++;
+ } while (--cloop);
+ } else
+ {
+ nnz = trailing_ones = 0;
+ }
+ nlevels = nnz - trailing_ones;
+
+ nnz_context = nz_ctx[-1] + nz_ctx[1];
+
+ nz_ctx[0] = (uint8_t)nnz;
+ if (nnz_context <= 34)
+ {
+ nnz_context = (nnz_context + 1) >> 1;
+ }
+ nnz_context &= 31;
+
+ // 9.2.1 Parsing process for total number of transform coefficient levels and trailing ones
+ {
+ int off = h264e_g_coeff_token[nnz_context];
+ unsigned n = 6, val = h264e_g_coeff_token[off + trailing_ones + 4*nlevels];
+ if (off != 230)
+ {
+ n = (val & 15) + 1;
+ val >>= 4;
+ }
+ BS_PUT(n, val);
+ }
+
+ if (nnz)
+ {
+ if (trailing_ones)
+ {
+ BS_PUT(trailing_ones, trailing_ones_sign);
+ }
+ if (nlevels)
+ {
+ int vlcnum = 1;
+ int sym_len, prefix_len;
+
+ int sym = *levels-- - 2;
+ if (sym < 0) sym = -3 - sym;
+ if (sym >= 6) vlcnum++;
+ if (trailing_ones < 3)
+ {
+ sym -= 2;
+ if (nnz > 10)
+ {
+ sym_len = 1;
+ prefix_len = sym >> 1;
+ if (prefix_len >= 15)
+ {
+ // or vlcnum = 1; goto escape;
+ prefix_len = 15;
+ sym_len = 12;
+ }
+ sym -= prefix_len << 1;
+ // bypass vlcnum advance due to sym -= 2; above
+ goto loop_enter;
+ }
+ }
+
+ if (sym < 14)
+ {
+ prefix_len = sym;
+ sym = 0; // to avoid side effect in bitbuf
+ sym_len = 0;
+ } else if (sym < 30)
+ {
+ prefix_len = 14;
+ sym_len = 4;
+ sym -= 14;
+ } else
+ {
+ vlcnum = 1;
+ goto escape;
+ }
+ goto loop_enter;
+
+ for (;;)
+ {
+ sym_len = vlcnum;
+ prefix_len = sym >> vlcnum;
+ if (prefix_len >= 15)
+ {
+escape:
+ prefix_len = 15;
+ sym_len = 12;
+ }
+ sym -= prefix_len << vlcnum;
+
+ if (prefix_len >= 3 && vlcnum < 6) vlcnum++;
+loop_enter:
+ sym |= 1 << sym_len;
+ sym_len += prefix_len+1;
+ BS_PUT(sym_len, (unsigned)sym);
+ if (!--nlevels) break;
+ sym = *levels-- - 2;
+ if (sym < 0) sym = -3 - sym;
+ }
+ }
+
+ if (nnz < maxNumCoeff)
+ {
+ const uint8_t *vlc = (maxNumCoeff == 4) ? h264e_g_total_zeros_cr_2x2 : h264e_g_total_zeros;
+ uint8_t *run = runs;
+ int run_prev = *run++;
+ int nzeros = run_prev - nnz;
+ int zeros_left = 2*nzeros - 1;
+ int ctx = nnz - 1;
+ run[nnz - 1] = (uint8_t)maxNumCoeff; // terminator
+ for(;;)
+ {
+ int t;
+ //encode_huff8(bs, vlc, ctx, nzeros);
+
+ unsigned val = vlc[vlc[ctx] + nzeros];
+ unsigned n = val & 15;
+ val >>= 4;
+ BS_PUT(n, val);
+
+ zeros_left -= nzeros;
+ if (zeros_left < 0)
+ {
+ break;
+ }
+
+ t = *run++;
+ nzeros = run_prev - t - 1;
+ if (nzeros < 0)
+ {
+ break;
+ }
+ run_prev = t;
+ assert(zeros_left < 14);
+ vlc = h264e_g_run_before;
+ ctx = zeros_left;
+ }
+ }
+ }
+ BS_CLOSE(bs);
+}
+
+#define MM_LOAD_8TO16_2(p) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(p)), _mm_setzero_si128())
+static __inline __m128i subabs128_16(__m128i a, __m128i b)
+{
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+static __inline __m128i clone2x16(const void *p)
+{
+ __m128i tmp = MM_LOAD_8TO16_2(p);
+ return _mm_unpacklo_epi16(tmp, tmp);
+}
+static __inline __m128i subabs128(__m128i a, __m128i b)
+{
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+static void transpose8x8_sse(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride)
+{
+ __m128i a = _mm_loadl_epi64((__m128i *)(src));
+ __m128i b = _mm_loadl_epi64((__m128i *)(src += src_stride));
+ __m128i c = _mm_loadl_epi64((__m128i *)(src += src_stride));
+ __m128i d = _mm_loadl_epi64((__m128i *)(src += src_stride));
+ __m128i e = _mm_loadl_epi64((__m128i *)(src += src_stride));
+ __m128i f = _mm_loadl_epi64((__m128i *)(src += src_stride));
+ __m128i g = _mm_loadl_epi64((__m128i *)(src += src_stride));
+ __m128i h = _mm_loadl_epi64((__m128i *)(src += src_stride));
+
+ __m128i p0 = _mm_unpacklo_epi8(a,b); // b7 a7 b6 a6 ... b0 a0
+ __m128i p1 = _mm_unpacklo_epi8(c,d); // d7 c7 d6 c6 ... d0 c0
+ __m128i p2 = _mm_unpacklo_epi8(e,f); // f7 e7 f6 e6 ... f0 e0
+ __m128i p3 = _mm_unpacklo_epi8(g,h); // h7 g7 h6 g6 ... h0 g0
+
+ __m128i q0 = _mm_unpacklo_epi16(p0, p1); // d3c3 b3a3 ... d0c0 b0a0
+ __m128i q1 = _mm_unpackhi_epi16(p0, p1); // d7c7 b7a7 ... d4c4 b4a4
+ __m128i q2 = _mm_unpacklo_epi16(p2, p3); // h3g3 f3e3 ... h0g0 f0e0
+ __m128i q3 = _mm_unpackhi_epi16(p2, p3); // h7g7 f7e7 ... h4g4 f4e4
+
+ __m128i r0 = _mm_unpacklo_epi32(q0, q2); // h1g1f1e1 d1c1b1a1 h0g0f0e0 d0c0b0a0
+ __m128i r1 = _mm_unpackhi_epi32(q0, q2); // h3g3f3e3 d3c3b3a3 h2g2f2e2 d2c2b2a2
+ __m128i r2 = _mm_unpacklo_epi32(q1, q3);
+ __m128i r3 = _mm_unpackhi_epi32(q1, q3);
+ _mm_storel_epi64((__m128i *)(dst), r0); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r0, r0)); dst += dst_stride;
+ _mm_storel_epi64((__m128i *)(dst), r1); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r1, r1)); dst += dst_stride;
+ _mm_storel_epi64((__m128i *)(dst), r2); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r2, r2)); dst += dst_stride;
+ _mm_storel_epi64((__m128i *)(dst), r3); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r3, r3)); dst += dst_stride;
+}
+
+static void deblock_chroma_h_s4_sse(uint8_t *pq0, int stride, const void* threshold, int alpha, int beta, uint32_t argstr)
+{
+ __m128i thr, str, d;
+ __m128i p1 = MM_LOAD_8TO16_2(pq0 - 2*stride);
+ __m128i p0 = MM_LOAD_8TO16_2(pq0 - stride);
+ __m128i q0 = MM_LOAD_8TO16_2(pq0);
+ __m128i q1 = MM_LOAD_8TO16_2(pq0 + stride);
+ __m128i zero = _mm_setzero_si128();
+ __m128i _alpha = _mm_set1_epi16((short)alpha);
+ __m128i _beta = _mm_set1_epi16((short)beta);
+ __m128i tmp;
+
+ str = _mm_cmplt_epi16(subabs128_16(p0, q0), _alpha);
+ str = _mm_and_si128(str, _mm_cmplt_epi16(_mm_max_epi16(subabs128_16(p1, p0), subabs128_16(q1, q0)), _beta));
+
+ if ((uint8_t)argstr != 4)
+ {
+ d = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_mm_sub_epi16(q0, p0), 2), p1), q1),_mm_set1_epi16(4)), 3);
+ thr = _mm_add_epi16(clone2x16(threshold), _mm_set1_epi16(1));
+ d = _mm_min_epi16(_mm_max_epi16(d, _mm_sub_epi16(zero, thr)), thr);
+
+ tmp = _mm_unpacklo_epi8(_mm_cvtsi32_si128(argstr), _mm_setzero_si128());
+ tmp = _mm_unpacklo_epi16(tmp, tmp);
+
+// str = _mm_and_si128(str, _mm_cmpgt_epi16(clone2x16(strength), zero));
+ str = _mm_and_si128(str, _mm_cmpgt_epi16(tmp, zero));
+ d = _mm_and_si128(str, d);
+ p0 = _mm_add_epi16(p0, d);
+ q0 = _mm_sub_epi16(q0, d);
+ } else
+ {
+ __m128i pq = _mm_add_epi16(p1, q1);
+ __m128i newp = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(pq, p1), p0), 1);
+ __m128i newq = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(pq, q1), q0), 1);
+ p0 = _mm_xor_si128(_mm_and_si128(_mm_xor_si128(_mm_avg_epu16(newp,zero), p0), str), p0);
+ q0 = _mm_xor_si128(_mm_and_si128(_mm_xor_si128(_mm_avg_epu16(newq,zero), q0), str), q0);
+ }
+ _mm_storel_epi64((__m128i*)(pq0 - stride), _mm_packus_epi16(p0, zero));
+ _mm_storel_epi64((__m128i*)(pq0 ), _mm_packus_epi16(q0, zero));
+}
+
+static void deblock_chroma_v_s4_sse(uint8_t *pix, int stride, const void* threshold, int alpha, int beta, uint32_t str)
+{
+ uint8_t t8x4[8*4];
+ int i;
+ uint8_t *p = pix - 2;
+ __m128i t0 =_mm_unpacklo_epi16(
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)p), _mm_cvtsi32_si128(*(int_u*)(p + stride))),
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)(p + 2*stride)), _mm_cvtsi32_si128(*(int_u*)(p + 3*stride)))
+ );
+ __m128i t1 =_mm_unpacklo_epi16(
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)(p + 4*stride)), _mm_cvtsi32_si128(*(int_u*)(p + 5*stride))),
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)(p + 6*stride)), _mm_cvtsi32_si128(*(int_u*)(p + 7*stride)))
+ );
+ __m128i p1 = _mm_unpacklo_epi32(t0, t1);
+ __m128i p0 = _mm_shuffle_epi32 (p1, 0x4E); // 01001110b
+ __m128i q0 = _mm_unpackhi_epi32(t0, t1);
+ __m128i q1 = _mm_shuffle_epi32 (q0, 0x4E);
+ _mm_storel_epi64((__m128i*)(t8x4), p1);
+ _mm_storel_epi64((__m128i*)(t8x4 + 8), p0);
+ _mm_storel_epi64((__m128i*)(t8x4 + 16), q0);
+ _mm_storel_epi64((__m128i*)(t8x4 + 24), q1);
+ deblock_chroma_h_s4_sse(t8x4 + 16, 8, threshold, alpha, beta, str);
+
+ for (i = 0; i < 8; i++)
+ {
+ pix[-1] = t8x4[8 + i];
+ pix[ 0] = t8x4[16 + i];
+ pix += stride;
+ }
+}
+
+#define CMP_BETA(p, q, beta) _mm_cmpeq_epi8(_mm_subs_epu8(_mm_subs_epu8(p, q), beta), _mm_subs_epu8(_mm_subs_epu8(q, p), beta))
+#define CMP_1(p, q, beta) (_mm_subs_epu8(subabs128(p, q), beta))
+
+static void deblock_luma_h_s4_sse(uint8_t *pix, int stride, int alpha, int beta)
+{
+ int ccloop = 2;
+ do
+ {
+ __m128i p3 = MM_LOAD_8TO16_2(pix - 4*stride);
+ __m128i p2 = MM_LOAD_8TO16_2(pix - 3*stride);
+ __m128i p1 = MM_LOAD_8TO16_2(pix - 2*stride);
+ __m128i p0 = MM_LOAD_8TO16_2(pix - stride);
+ __m128i q0 = MM_LOAD_8TO16_2(pix);
+ __m128i q1 = MM_LOAD_8TO16_2(pix + stride);
+ __m128i q2 = MM_LOAD_8TO16_2(pix + 2*stride);
+ __m128i q3 = MM_LOAD_8TO16_2(pix + 3*stride);
+ __m128i zero = _mm_setzero_si128();
+ __m128i _alpha = _mm_set1_epi16((short)alpha);
+ __m128i _quarteralpha = _mm_set1_epi16((short)((alpha >> 2) + 2));
+ __m128i _beta = _mm_set1_epi16((short)beta);
+ __m128i ap_less_beta;
+ __m128i aq_less_beta;
+ __m128i str;
+ __m128i pq;
+ __m128i short_p;
+ __m128i short_q;
+ __m128i long_p;
+ __m128i long_q;
+ __m128i t;
+ __m128i p0q0_less__quarteralpha;
+
+ __m128i absdif_p0_q0 = subabs128_16(p0, q0);
+ __m128i p0_plus_q0 = _mm_add_epi16(_mm_add_epi16(p0, q0), _mm_set1_epi16(2));
+
+ // if (abs_p0_q0 < alpha && abs_p1_p0 < beta && abs_q1_q0 < beta)
+ str = _mm_cmplt_epi16(absdif_p0_q0, _alpha);
+ //str = _mm_and_si128(str, _mm_cmplt_epi16(subabs128_16(p1, p0), _beta));
+ //str = _mm_and_si128(str, _mm_cmplt_epi16(subabs128_16(q1, q0), _beta));
+ str = _mm_and_si128(str, _mm_cmplt_epi16(_mm_max_epi16(subabs128_16(p1, p0), subabs128_16(q1, q0)), _beta));
+ p0q0_less__quarteralpha = _mm_and_si128(_mm_cmplt_epi16(absdif_p0_q0, _quarteralpha), str);
+
+ //int short_p = (2*p1 + p0 + q1 + 2);
+ //int short_q = (2*q1 + q0 + p1 + 2);
+ short_p = _mm_avg_epu8(_mm_avg_epu8(p0, q1),p1);
+ pq = _mm_add_epi16(_mm_add_epi16(p1, q1), _mm_set1_epi16(2));
+ short_p = _mm_add_epi16(_mm_add_epi16(pq, p1), p0);
+ short_q = _mm_add_epi16(_mm_add_epi16(pq, q1), q0);
+
+ ap_less_beta = _mm_and_si128(_mm_cmplt_epi16(subabs128_16(p2, p0), _beta), p0q0_less__quarteralpha);
+ t = _mm_add_epi16(_mm_add_epi16(p2, p1), p0_plus_q0);
+ // short_p += t - p1 + q0;
+ long_p = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(_mm_add_epi16(short_p, t), p1), q0), 1);
+
+ _mm_storel_epi64((__m128i*)(pix - 2*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(ap_less_beta, _mm_srai_epi16(t, 2)), _mm_andnot_si128(ap_less_beta, p1)), zero));
+ t = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(_mm_add_epi16(p3, p2), 1), t), _mm_set1_epi16(2));
+ _mm_storel_epi64((__m128i*)(pix - 3*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(ap_less_beta, _mm_srai_epi16(t, 3)), _mm_andnot_si128(ap_less_beta, p2)), zero));
+
+ aq_less_beta = _mm_and_si128(_mm_cmplt_epi16(subabs128_16(q2, q0), _beta), p0q0_less__quarteralpha);
+ t = _mm_add_epi16(_mm_add_epi16(q2, q1), p0_plus_q0);
+ long_q = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(_mm_add_epi16(short_q, t), q1), p0), 1);
+ _mm_storel_epi64((__m128i*)(pix + 1*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(aq_less_beta, _mm_srai_epi16(t, 2)), _mm_andnot_si128(aq_less_beta, q1)), zero));
+
+ t = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(_mm_add_epi16(q3, q2), 1), t), _mm_set1_epi16(2));
+ _mm_storel_epi64((__m128i*)(pix + 2*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(aq_less_beta, _mm_srai_epi16(t, 3)), _mm_andnot_si128(aq_less_beta, q2)), zero));
+
+ short_p = _mm_srai_epi16(_mm_or_si128(_mm_and_si128(ap_less_beta, long_p), _mm_andnot_si128(ap_less_beta, short_p)), 2);
+ short_q = _mm_srai_epi16(_mm_or_si128(_mm_and_si128(aq_less_beta, long_q), _mm_andnot_si128(aq_less_beta, short_q)), 2);
+
+ _mm_storel_epi64((__m128i*)(pix - stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(str, short_p), _mm_andnot_si128(str, p0)), zero));
+ _mm_storel_epi64((__m128i*)(pix ), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(str, short_q), _mm_andnot_si128(str, q0)), zero));
+
+ pix += 8;
+ } while (--ccloop);
+}
+
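+// Vertical-edge wrapper: transposes the 4 pixels on each side of the edge for all
+// 16 rows into a 16-byte-stride scratch, runs the horizontal-edge filter on it,
+// then transposes the result back into place.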
+static void deblock_luma_v_s4_sse(uint8_t *pix, int stride, int alpha, int beta)
+{
+ __m128i scratch[8];
+ uint8_t *s = pix - 4;
+ uint8_t *dst = (uint8_t *)scratch;
+ int cloop = 2;
+ do
+ {
+ transpose8x8_sse(dst, 16, s, stride);
+ s += 8*stride;
+ dst += 8;
+ } while(--cloop);
+
+ deblock_luma_h_s4_sse((uint8_t *)(scratch+4), 16, alpha, beta);
+ s = pix - 4;
+ dst = (uint8_t *)scratch;
+ cloop = 2;
+ do
+ {
+ transpose8x8_sse(s, stride, dst, 16);
+ s += 8*stride;
+ dst += 8;
+ } while(--cloop);
+}
+
+// (a-b) >> 1s == ((a + ~b + 1) >> 1u) - 128;
+//
+// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3 =
+// (4*q0 - 4*p0 + p1 - q1 + 4) >> 3 =
+// ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4) >> 3
+// ((p1-p0) - (q1-q0) - 3*p0 + 3*q0 + 4) >> 3
+// (((p1-p0)-p0)>>1 - ((q1-q0)-q0)>>1 - p0 + q0 + 2) >> 2
+// ((((p1-p0)-p0)>>1 - p0)>>1 - (((q1-q0)-q0)>>1 - q0)>>1 + 1) >> 1
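+//
+// e.g. p1 = 60, p0 = 50, q0 = 70, q1 = 80:
+// delta = (((70-50)<<2) + (60-80) + 4) >> 3 = (80 - 20 + 4) >> 3 = 8,
+// which the code below then clips to the tc0-derived threshold.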
+static void deblock_luma_h_s3_sse(uint8_t *h264e_restrict pix, int stride, int alpha, int beta, const void* threshold, uint32_t strength)
+{
+ __m128i p1 = _mm_loadu_si128((__m128i *)(pix - 2*stride));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(pix - stride));
+ __m128i q0 = _mm_loadu_si128((__m128i *)pix);
+ __m128i q1 = _mm_loadu_si128((__m128i *)(pix + stride));
+ __m128i maskp, maskq, zeromask, thr;
+ __m128i tc0tmp, p2, q2, p0q0avg, _beta;
+
+#define HALFSUM(x, y) _mm_sub_epi8(_mm_avg_epu8(x, y), _mm_and_si128(_mm_xor_si128(y, x), _mm_set1_epi8(1)))
+
+ // if (ABS(p0-q0) - alpha) ...
+ zeromask = _mm_subs_epu8(subabs128(p0, q0), _mm_set1_epi8((int8_t)(alpha - 1)));
+ // & (ABS(p1-p0) - beta) & (ABS(q1-q0) - beta)
+ _beta = _mm_set1_epi8((int8_t)(beta - 1));
+ zeromask = _mm_or_si128(zeromask, _mm_subs_epu8(_mm_max_epu8(subabs128(p1, p0), subabs128(q1, q0)), _beta));
+ zeromask = _mm_cmpeq_epi8(zeromask, _mm_setzero_si128());
+
+ {
+ __m128i str_x = _mm_cvtsi32_si128(strength);
+ str_x = _mm_unpacklo_epi8(str_x, str_x);
+ str_x = _mm_cmpgt_epi8(_mm_unpacklo_epi8(str_x, str_x), _mm_setzero_si128());
+ zeromask = _mm_and_si128(zeromask, str_x);
+ }
+
+ thr = _mm_cvtsi32_si128(*(int*)threshold);//_mm_loadl_epi64((__m128i *)(threshold));
+ thr = _mm_unpacklo_epi8(thr, thr);
+ thr = _mm_unpacklo_epi8(thr, thr);
+ thr = _mm_and_si128(thr, zeromask);
+
+ p2 = _mm_loadu_si128((__m128i *)(pix - 3*stride));
+ maskp = CMP_BETA(p2, p0, _beta);
+ tc0tmp = _mm_and_si128(thr, maskp);
+ p0q0avg = _mm_avg_epu8(p0, q0); // (p0+q0+1)>>1
+ _mm_storeu_si128((__m128i *)(pix - 2*stride), _mm_min_epu8(_mm_max_epu8(HALFSUM(p2, p0q0avg), _mm_subs_epu8(p1, tc0tmp)), _mm_adds_epu8(p1, tc0tmp)));
+
+ q2 = _mm_loadu_si128((__m128i *)(pix + 2*stride));
+ maskq = CMP_BETA(q2, q0, _beta);
+ tc0tmp = _mm_and_si128(thr, maskq);
+ _mm_storeu_si128((__m128i *)(pix + stride), _mm_min_epu8(_mm_max_epu8(HALFSUM(q2, p0q0avg), _mm_subs_epu8(q1, tc0tmp)), _mm_adds_epu8(q1, tc0tmp)));
+
+ thr = _mm_sub_epi8(thr, maskp);
+ thr = _mm_sub_epi8(thr, maskq);
+ thr = _mm_and_si128(thr, zeromask);
+
+ {
+ __m128i ff = _mm_set1_epi8(0xff);
+ __m128i part1 = _mm_avg_epu8(q0, _mm_xor_si128(p0, ff));
+ __m128i part2 = _mm_avg_epu8(p1, _mm_xor_si128(q1, ff));
+ __m128i carry = _mm_and_si128(_mm_xor_si128(p0, q0), _mm_set1_epi8(1));
+ __m128i d = _mm_adds_epu8(part1, _mm_avg_epu8(_mm_avg_epu8(part2, _mm_set1_epi8(3)), carry));
+ __m128i delta_p = _mm_subs_epu8(d, _mm_set1_epi8((char)(128 + 33)));
+ __m128i delta_n = _mm_subs_epu8(_mm_set1_epi8((char)(128 + 33)), d);
+ delta_p = _mm_min_epu8(delta_p, thr);
+ delta_n = _mm_min_epu8(delta_n, thr);
+
+ q0 = _mm_adds_epu8(_mm_subs_epu8(q0, delta_p), delta_n);
+ p0 = _mm_subs_epu8(_mm_adds_epu8(p0, delta_p), delta_n);
+
+ _mm_storeu_si128 ((__m128i *)(pix - stride), p0);
+ _mm_storeu_si128 ((__m128i *)pix, q0);
+ }
+}
+
+static void deblock_luma_v_s3_sse(uint8_t *pix, int stride, int alpha, int beta, const void* thr, uint32_t strength)
+{
+ __m128i scratch[8];
+ uint8_t *s = pix - 4;
+ uint8_t *dst = (uint8_t *)scratch;
+ int cloop = 2;
+ do
+ {
+ transpose8x8_sse(dst, 16, s, stride);
+ s += 8*stride;
+ dst += 8;
+ } while(--cloop);
+
+ deblock_luma_h_s3_sse((uint8_t*)(scratch + 4), 16, alpha, beta, thr, strength);
+ s = pix - 4;
+ dst = (uint8_t *)scratch;
+ cloop = 2;
+ do
+ {
+ transpose8x8_sse(s, stride, dst, 16);
+ s += 8*stride;
+ dst += 8;
+ } while(--cloop);
+}
+
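+// Chroma deblock driver: filters the vertical edges first, then the horizontal
+// ones; an edge segment is skipped when its 4-byte strength word is zero or when
+// alpha is zero (filtering disabled for that edge).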
+static void h264e_deblock_chroma_sse2(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+ const uint8_t *alpha = par->alpha;
+ const uint8_t *beta = par->beta;
+ const uint8_t *thr = par->tc0;
+ const uint8_t *strength = (uint8_t *)par->strength32;
+ int a, b, x, y;
+ a = alpha[0];
+ b = beta[0];
+ for (x = 0; x < 16; x += 8)
+ {
+ uint32_t str = *(uint32_t*)&strength[x];
+ if (str && a)
+ {
+ deblock_chroma_v_s4_sse(pix + (x >> 1), stride, thr + x, a, b, str);
+ }
+ a = alpha[1];
+ b = beta[1];
+ }
+ thr += 16;
+ strength += 16;
+ a = alpha[2];
+ b = beta[2];
+ for (y = 0; y < 16; y += 8)
+ {
+ uint32_t str = *(uint32_t*)&strength[y];
+ if (str && a)
+ {
+ deblock_chroma_h_s4_sse(pix, stride, thr + y, a, b, str);
+ }
+ pix += 4*stride;
+ a = alpha[3];
+ b = beta[3];
+ }
+}
+
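+// Luma deblock driver: four vertical edges (x = 0, 4, 8, 12), then four horizontal
+// ones. Strength 4 selects the strong bS=4 filter, any other non-zero strength the
+// normal tc0-clipped filter; alpha == 0 disables the edge.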
+static void h264e_deblock_luma_sse2(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+ const uint8_t *alpha = par->alpha;
+ const uint8_t *beta = par->beta;
+ const uint8_t *thr = par->tc0;
+ const uint8_t *strength = (uint8_t *)par->strength32;
+ int a, b, x, y;
+ a = alpha[0];
+ b = beta[0];
+ for (x = 0; x < 16; x += 4)
+ {
+ uint32_t str = *(uint32_t*)&strength[x];
+ if ((uint8_t)str == 4)
+ {
+ deblock_luma_v_s4_sse(pix + x, stride, a, b);
+ } else if (str && a)
+ {
+ deblock_luma_v_s3_sse(pix + x, stride, a, b, thr + x, str);
+ }
+ a = alpha[1];
+ b = beta[1];
+ }
+ thr += 16;
+ strength += 16;
+ a = alpha[2];
+ b = beta[2];
+ for (y = 0; y < 16; y += 4)
+ {
+ uint32_t str = *(uint32_t*)&strength[y];
+ if ((uint8_t)str == 4)
+ {
+ deblock_luma_h_s4_sse(pix, stride, a, b);
+ } else if (str && a)
+ {
+ deblock_luma_h_s3_sse(pix, stride, a, b, thr + y, str);
+ }
+ a = alpha[3];
+ b = beta[3];
+ pix += 4*stride;
+ }
+}
+
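+// Temporal denoise: blends the current frame into the previous-frame buffer with a
+// gain derived from the pixel difference and the 4-neighbour difference (the scalar
+// tail uses the g_diff_to_gainQ8 table). The SIMD path gets a cheap log2 instead:
+// the 16-bit difference is squared four times as float (x^16) and the float
+// exponent field is read back, giving roughly 16*log2(x) without a lookup.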
+static void h264e_denoise_run_sse2(unsigned char *frm, unsigned char *frmprev, int w, int h_arg, int stride_frm, int stride_frmprev)
+{
+#define MM_LOAD_8TO16(p) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(p)), zero)
+ int cloop, h = h_arg;
+ __m128i zero = _mm_setzero_si128();
+ __m128i exp = _mm_set1_epi32(0x7F800000);
+
+ w -= 2;
+ h -= 2;
+ if (w <= 2 || h <= 2)
+ {
+ return;
+ }
+
+ do
+ {
+ unsigned char *pf = frm += stride_frm;
+ unsigned char *pp = frmprev += stride_frmprev;
+ cloop = w >> 3;
+ pp[-stride_frmprev] = *pf++;
+ pp++;
+
+ while (cloop--)
+ {
+ __m128 float_val;
+ __m128i log_neighbour, log_d;
+ __m128i log_neighbour_h, log_neighbour_l, log_d_h, log_d_l;
+ __m128i a, b;
+ __m128i gain;
+ __m128i abs_d, abs_neighbour;
+ a = MM_LOAD_8TO16(pf);
+ b = MM_LOAD_8TO16(pp);
+ abs_d = _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+ a = MM_LOAD_8TO16(pf-stride_frm);
+ a = _mm_add_epi16(a, MM_LOAD_8TO16(pf - 1));
+ a = _mm_add_epi16(a, MM_LOAD_8TO16(pf + 1));
+ a = _mm_add_epi16(a, MM_LOAD_8TO16(pf + stride_frm));
+ b = MM_LOAD_8TO16(pp-stride_frmprev);
+ b = _mm_add_epi16(b, MM_LOAD_8TO16(pp - 1));
+ b = _mm_add_epi16(b, MM_LOAD_8TO16(pp + 1));
+ b = _mm_add_epi16(b, MM_LOAD_8TO16(pp + stride_frmprev));
+
+ abs_neighbour = _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+
+ abs_neighbour = _mm_srai_epi16(abs_neighbour, 2);
+
+ abs_d = _mm_add_epi16(abs_d, _mm_set1_epi16(1));
+ abs_neighbour = _mm_add_epi16(abs_neighbour, _mm_set1_epi16(1));
+
+ float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(abs_neighbour, zero), 16), 16));
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ log_neighbour_l = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+ float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(abs_neighbour, zero), 16), 16));
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ log_neighbour_h = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+ float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(abs_d, zero), 16), 16));
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ log_d_l = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+ float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(abs_d, zero), 16), 16));
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ float_val = _mm_mul_ps(float_val, float_val);
+ log_d_h = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+ log_d = _mm_packs_epi32(log_d_l, log_d_h);
+ log_neighbour = _mm_packs_epi32(log_neighbour_l, log_neighbour_h);
+
+ log_neighbour = _mm_slli_epi16(log_neighbour, 8);
+ log_neighbour = _mm_adds_epu16(log_neighbour, log_neighbour);
+ log_neighbour = _mm_adds_epu16(log_neighbour, log_neighbour);
+ log_neighbour = _mm_srli_epi16(log_neighbour, 8);
+
+ log_neighbour = _mm_subs_epu16(_mm_set1_epi16(255), log_neighbour);
+ log_d = _mm_subs_epu16(_mm_set1_epi16(255), log_d);
+
+ gain = _mm_mullo_epi16(log_d, log_neighbour);
+
+ a = MM_LOAD_8TO16(pf);
+ b = MM_LOAD_8TO16(pp);
+{
+ __m128i s;
+ __m128i gain_inv;
+ gain_inv = _mm_sub_epi16(_mm_set1_epi8((char)255), gain);
+ s = _mm_add_epi16(_mm_mulhi_epu16(a, gain_inv), _mm_mulhi_epu16(b, gain));
+ b = _mm_mullo_epi16(b, gain);
+ a = _mm_mullo_epi16(a, gain_inv);
+ a = _mm_sub_epi16(_mm_avg_epu16(a, b), _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi16(1)));
+ a = _mm_avg_epu16(_mm_srli_epi16(a, 14), _mm_set1_epi16(0));
+ a = _mm_add_epi16(a, s);
+ _mm_storel_epi64((__m128i *)(pp-stride_frmprev), _mm_packus_epi16(a,zero));
+}
+ pf += 8;
+ pp += 8;
+ }
+
+ cloop = w & 7;
+ while (cloop--)
+ {
+ int d, neighbourhood;
+ unsigned g, gd, gn, out_val;
+ d = pf[0] - pp[0];
+ neighbourhood = pf[-1] - pp[-1];
+ neighbourhood += pf[+1] - pp[+1];
+ neighbourhood += pf[-stride_frm] - pp[-stride_frmprev];
+ neighbourhood += pf[+stride_frm] - pp[+stride_frmprev];
+
+ if (d < 0)
+ {
+ d = -d;
+ }
+ if (neighbourhood < 0)
+ {
+ neighbourhood = -neighbourhood;
+ }
+ neighbourhood >>= 2;
+
+ gd = g_diff_to_gainQ8[d];
+ gn = g_diff_to_gainQ8[neighbourhood];
+
+ gn <<= 2;
+ if (gn > 255)
+ {
+ gn = 255;
+ }
+
+ gn = 255 - gn;
+ gd = 255 - gd;
+ g = gn*gd; // Q8*Q8 = Q16;
+
+ //out_val = ((pp[0]*g ) >> 16) + (((0xffff-g)*pf[0] ) >> 16);
+ out_val = (pp[0]*g + (0xffff-g)*pf[0] + (1<<15)) >> 16;
+
+ assert(out_val <= 255);
+
+ pp[-stride_frmprev] = (unsigned char)out_val;
+
+ pf++, pp++;
+ }
+ pp[-stride_frmprev] = *pf++;
+ } while(--h);
+
+ memcpy(frmprev + stride_frmprev, frm + stride_frm, w+2);
+ h = h_arg - 2;
+ do
+ {
+ memcpy(frmprev, frmprev - stride_frmprev, w+2);
+ frmprev -= stride_frmprev;
+ } while(--h);
+ memcpy(frmprev, frm - stride_frm*(h_arg-2), w+2);
+}
+
+#define IS_NULL(p) ((p) < (pix_t *)(uintptr_t)32)
+
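+// DC prediction helper: sums the available left/top edges with _mm_sad_epu8,
+// rounds and divides by the number of contributing pels, and returns the DC
+// replicated into all four bytes; with neither edge available it returns 128s.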
+static uint32_t intra_predict_dc_sse(const pix_t *left, const pix_t *top, int log_side)
+{
+ unsigned dc = 0, side = 1u << log_side, round = 0;
+ __m128i sum = _mm_setzero_si128();
+ if (!IS_NULL(left))
+ {
+ int cloop = side;
+ round += side >> 1;
+ do
+ {
+ sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_cvtsi32_si128(*(int*)left), _mm_setzero_si128()));
+ left += 4;
+ } while (cloop -= 4);
+ }
+ if (!IS_NULL(top))
+ {
+ int cloop = side;
+ round += side >> 1;
+ do
+ {
+ sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_cvtsi32_si128(*(int*)top), _mm_setzero_si128()));
+ top += 4;
+ } while (cloop -= 4);
+ }
+ dc = _mm_cvtsi128_si32(sum);
+ dc += round;
+ if (round == side) dc >>= 1;
+ dc >>= log_side;
+ if (!round) dc = 128;
+ return dc * 0x01010101;
+}
+
+/*
+ * Note: To make the code more readable we refer to the neighboring pixels
+ * in variables named as below:
+ *
+ * UL U0 U1 U2 U3 U4 U5 U6 U7
+ * L0 xx xx xx xx
+ * L1 xx xx xx xx
+ * L2 xx xx xx xx
+ * L3 xx xx xx xx
+ */
+#define UL edge[-1]
+#define U0 edge[0]
+#define T1 edge[1]
+#define U2 edge[2]
+#define U3 edge[3]
+#define U4 edge[4]
+#define U5 edge[5]
+#define U6 edge[6]
+#define U7 edge[7]
+#define L0 edge[-2]
+#define L1 edge[-3]
+#define L2 edge[-4]
+#define L3 edge[-5]
+
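+// 16x16 intra prediction: mode 0 copies the top row (vertical), mode 1 replicates
+// each left pel across its row (horizontal), anything else is DC.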
+static void h264e_intra_predict_16x16_sse2(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+ int cloop = 16;
+ if (mode < 1)
+ {
+ __m128i a = _mm_load_si128((__m128i *)top);
+ do
+ {
+ _mm_store_si128((__m128i *)predict, a);
+ predict += 16;
+ } while(--cloop);
+ } else if (mode == 1)
+ {
+ const __m128i c1111 = _mm_set1_epi8(1);
+ do
+ {
+ _mm_store_si128((__m128i *)predict, _mm_shuffle_epi32(_mm_mul_epu32(_mm_cvtsi32_si128(*left++), c1111), 0));
+ predict += 16;
+ } while(--cloop);
+ } else //if (mode == 2)
+ {
+ __m128i dc128;
+ int dc = intra_predict_dc_sse(left, top, 4);
+ dc128 = _mm_shuffle_epi32(_mm_cvtsi32_si128(dc), 0);
+ do
+ {
+ _mm_store_si128((__m128i *)predict, dc128);
+ predict += 16;
+ } while(--cloop);
+ }
+}
+
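+// 8x8 chroma prediction, two planes per call (they sit side by side in the
+// 16-byte-wide prediction rows). The DC branch computes a DC per 4x4 sub-block,
+// using only the top edge for the upper-right block and only the left edge for the
+// lower-left one, as the chroma DC rule requires.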
+static void h264e_intra_predict_chroma_sse2(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+ int cloop = 8;
+ if (mode < 1)
+ {
+ __m128i a = _mm_load_si128((__m128i *)top);
+ do
+ {
+ _mm_store_si128((__m128i *)predict, a);
+ predict += 16;
+ } while(--cloop);
+ } else if (mode == 1)
+ {
+ do
+ {
+ __m128i t = _mm_unpacklo_epi32(_mm_cvtsi32_si128(left[0]*0x01010101u), _mm_cvtsi32_si128(left[8]*0x01010101u));
+ t = _mm_unpacklo_epi32(t, t);
+ _mm_store_si128((__m128i *)predict, t);
+ left++;
+ predict += 16;
+ } while(--cloop);
+ } else //if (mode == 2)
+ {
+ // chroma
+ uint32_t *d = (uint32_t*)predict;
+ __m128i *d128 = (__m128i *)predict;
+ __m128i tmp;
+ cloop = 2;
+ do
+ {
+ d[0] = d[1] = d[16] = intra_predict_dc_sse(left, top, 2);
+ d[17] = intra_predict_dc_sse(left + 4, top + 4, 2);
+ if (!IS_NULL(top))
+ {
+ d[1] = intra_predict_dc_sse(NULL, top + 4, 2);
+ }
+ if (!IS_NULL(left))
+ {
+ d[16] = intra_predict_dc_sse(NULL, left + 4, 2);
+ }
+ d += 2;
+ left += 8;
+ top += 8;
+ } while(--cloop);
+ tmp = _mm_load_si128(d128++);
+ _mm_store_si128(d128++, tmp);
+ _mm_store_si128(d128++, tmp);
+ _mm_store_si128(d128++, tmp);
+ tmp = _mm_load_si128(d128++);
+ _mm_store_si128(d128++, tmp);
+ _mm_store_si128(d128++, tmp);
+ _mm_store_si128(d128++, tmp);
+ }
+}
+
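+// 4x4 intra mode decision: the edge pels are packed into one register (border) and
+// each candidate predictor is built from it with shuffles, scored by SAD against
+// the source block plus a penalty when the mode differs from the most probable one
+// (mpred). DC is always tried; modes 0/3/7 need the top edge, 1/8 the left edge,
+// and 4/6/5 need top, left and top-left. Returns best_m | (best_sad << 4).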
+static int h264e_intra_choose_4x4_sse2(const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty)
+{
+ int best_m = 0;
+ int sad, best_sad = 0x10000;
+
+ __m128i b0 = _mm_loadl_epi64((__m128i *)blockin);
+ __m128i b1 = _mm_loadl_epi64((__m128i *)(blockin + 16));
+ __m128i b2 = _mm_loadl_epi64((__m128i *)(blockin + 32));
+ __m128i b3 = _mm_loadl_epi64((__m128i *)(blockin + 48));
+ __m128i c = _mm_unpacklo_epi32(b0, b1);
+ __m128i d = _mm_unpacklo_epi32(b2, b3);
+ __m128i sse_blockin = _mm_unpacklo_epi64(c, d);
+ __m128i t, t0, t1, t2, res, sad128, best128;
+
+#define TEST(mode) sad128 = _mm_sad_epu8(res, sse_blockin); \
+ sad128 = _mm_adds_epu16 (sad128, _mm_shuffle_epi32(sad128, 2)); \
+ sad = _mm_cvtsi128_si32(sad128); \
+ if (mode != mpred) sad += penalty; \
+ if (sad < best_sad) \
+ { \
+ best128 = res; \
+ best_sad = sad; \
+ best_m = mode; \
+ }
+
+ __m128i border = _mm_loadu_si128((__m128i *)(&L3));
+ int topright = 0x01010101u*U7;
+
+ if (!(avail & AVAIL_TR))
+ {
+ topright = 0x01010101u*U3;
+ //border = _mm_insert_epi32 (border, topright, 2);
+ border = _mm_insert_epi16 (border, topright, 4);
+ border = _mm_insert_epi16 (border, topright, 5);
+ }
+ //border = _mm_insert_epi32 (border, topright, 3);
+ border = _mm_insert_epi16 (border, topright, 6);
+ border = _mm_insert_epi16 (border, topright, 7);
+
+ // DC
+ {
+ unsigned dc = 0, round = 0;
+
+ if (avail & AVAIL_L)
+ {
+ dc += _mm_cvtsi128_si32(_mm_sad_epu8(_mm_and_si128(border, _mm_set_epi32(0, 0, 0, ~0)), _mm_setzero_si128()));
+ round += 2;
+ }
+ if (avail & AVAIL_T)
+ {
+ dc += _mm_cvtsi128_si32(_mm_sad_epu8(_mm_and_si128(_mm_srli_si128(border, 5), _mm_set_epi32(0, 0, 0, ~0)), _mm_setzero_si128()));
+ round += 2;
+ }
+ dc += round;
+ if (round == 4) dc >>= 1;
+ dc >>= 2;
+ if (!round) dc = 128;
+ t = _mm_cvtsi32_si128(dc * 0x01010101);
+ t = _mm_unpacklo_epi32(t, t);
+ best128 =_mm_unpacklo_epi32(t, t);
+
+ //TEST(2)
+ sad128 = _mm_sad_epu8(best128, sse_blockin);
+ sad128 = _mm_adds_epu16 (sad128, _mm_shuffle_epi32(sad128, 2));
+ best_sad = _mm_cvtsi128_si32(sad128);
+
+ if (2 != mpred) best_sad += penalty;
+ best_m = 2;
+ }
+
+ if (avail & AVAIL_T)
+ {
+ t = _mm_srli_si128(border, 5);
+ t = _mm_unpacklo_epi32(t, t);
+ res = _mm_unpacklo_epi32(t, t);
+ TEST(0)
+
+ t0 = _mm_srli_si128(border, 5);
+ t1 = _mm_srli_si128(border, 6);
+ t2 = _mm_srli_si128(border, 7);
+ t = _mm_sub_epi8(_mm_avg_epu8(t0, t2), _mm_and_si128(_mm_xor_si128(t0, t2), _mm_set1_epi8(1)));
+ t = _mm_avg_epu8(t, t1);
+ t2 = _mm_unpacklo_epi32(t, _mm_srli_si128(t, 1));
+
+ res = _mm_unpacklo_epi64(t2, _mm_unpacklo_epi32(_mm_srli_si128(t, 2), _mm_srli_si128(t, 3)));
+ TEST(3)
+
+ t0 = _mm_avg_epu8(t0,t1);
+ t0 = _mm_unpacklo_epi32(t0, _mm_srli_si128(t0, 1));
+ res = _mm_unpacklo_epi32(t0, t2);
+ TEST(7)
+ }
+
+ if (avail & AVAIL_L)
+ {
+ int ext;
+ t = _mm_unpacklo_epi8(border, border);
+ t = _mm_shufflelo_epi16(t, 3 + (2 << 2) + (1 << 4) + (0 << 6));
+ res = _mm_unpacklo_epi8(t, t);
+ TEST(1)
+
+ t0 = _mm_unpacklo_epi8(border, _mm_setzero_si128());
+ t0 = _mm_shufflelo_epi16(t0, 3 + (2 << 2) + (1 << 4) + (0 << 6));
+ t0 = _mm_packus_epi16(t0, t0); // 0 1 2 3
+
+ t1 = _mm_unpacklo_epi8(t0, t0); // 0 0 1 1 2 2 3 3
+
+ ext = _mm_extract_epi16(t1, 3);
+ t0 = _mm_insert_epi16 (t0, ext, 2); // 0 1 2 3 3 3
+ t1 = _mm_insert_epi16 (t1, ext, 4); // 0 0 1 1 2 2 3 3 33
+ t2 = _mm_slli_si128(t0, 2); // x x 0 1 2 3 3 3
+ t = _mm_sub_epi8(_mm_avg_epu8(t0, t2), _mm_and_si128(_mm_xor_si128(t0, t2), _mm_set1_epi8(1)));
+ // 0 1 2 3 3 3
+ // x x 0 1 2 3
+ t = _mm_unpacklo_epi8(t2, t);
+ // 0 1 2 3 3 3
+ // x x 0 1 2 3
+ // x x 0 1 2 3
+ t = _mm_avg_epu8(t, _mm_slli_si128(t1, 2));
+ // 0 0 1 1 2 2 3 3
+
+ res = _mm_unpacklo_epi32(_mm_srli_si128(t, 4), _mm_srli_si128(t, 6));
+ //res = _mm_insert_epi32 (res, ext|(ext<<16),3);
+ res = _mm_insert_epi16 (res, ext, 6);
+ res = _mm_insert_epi16 (res, ext, 7);
+ TEST(8)
+ }
+
+ if ((avail & (AVAIL_T | AVAIL_L | AVAIL_TL)) == (AVAIL_T | AVAIL_L | AVAIL_TL))
+ {
+ int t16;
+ t0 = _mm_srli_si128(border, 1);
+ t1 = _mm_srli_si128(border, 2);
+ t = _mm_sub_epi8(_mm_avg_epu8(border, t1), _mm_and_si128(_mm_xor_si128(border, t1), _mm_set1_epi8(1)));
+ t = _mm_avg_epu8(t, t0);
+
+ res = _mm_unpacklo_epi64(_mm_unpacklo_epi32(_mm_srli_si128(t, 3), _mm_srli_si128(t, 2)), _mm_unpacklo_epi32(_mm_srli_si128(t, 1), t));
+ TEST(4)
+
+ t1 = _mm_unpacklo_epi8(t2 = _mm_avg_epu8(t0,border), t);
+ t1 = _mm_unpacklo_epi32(t1, _mm_srli_si128(t1, 2));
+ res = _mm_shuffle_epi32(t1, 3 | (2 << 2) | (1 << 4) | (0 << 6));
+ res = _mm_insert_epi16 (res, _mm_extract_epi16 (t, 2), 1);
+ TEST(6)
+
+ t = _mm_srli_si128(t, 1);
+ res = _mm_unpacklo_epi32(_mm_srli_si128(t2, 4), _mm_srli_si128(t, 2));
+ t2 = _mm_insert_epi16 (t2, t16 = _mm_extract_epi16 (t, 0), 1);
+ t = _mm_insert_epi16 (t, (t16 << 8), 0);
+ res = _mm_unpacklo_epi64(res, _mm_unpacklo_epi32(_mm_srli_si128(t2, 3), _mm_srli_si128(t, 1)));
+ TEST(5)
+ }
+
+ ((uint32_t *)blockpred)[ 0] = _mm_extract_epi16(best128, 0) | ((unsigned)_mm_extract_epi16(best128, 1) << 16);
+ ((uint32_t *)blockpred)[ 4] = _mm_extract_epi16(best128, 2) | ((unsigned)_mm_extract_epi16(best128, 3) << 16);
+ ((uint32_t *)blockpred)[ 8] = _mm_extract_epi16(best128, 4) | ((unsigned)_mm_extract_epi16(best128, 5) << 16);
+ ((uint32_t *)blockpred)[12] = _mm_extract_epi16(best128, 6) | ((unsigned)_mm_extract_epi16(best128, 7) << 16);
+
+ return best_m + (best_sad << 4); // pack result
+}
+
+#define MM_LOAD_8TO16(p) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(p)), zero)
+#define MM_LOAD_REG(p, sh) _mm_unpacklo_epi8(_mm_srli_si128(p, sh), zero)
+#define __inline
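+// Motion-compensation block copy into the 16-byte-stride prediction scratch, for
+// 16- or 8-pel-wide blocks.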
+static __inline void copy_wh_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ assert(h % 4 == 0);
+ if (w == 16)
+ {
+ do
+ {
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+ } while(h -= 8);
+ } else //if (w == 8)
+ {
+ do
+ {
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ } while(h -= 4);
+ }
+}
+
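+// Half-pel diagonal interpolation: the 6-tap (1,-5,20,20,-5,1) filter is applied
+// horizontally into a 16-bit scratch of h+5 rows, then vertically over the scratch,
+// with the combined rounding folded into (... + 32) >> 6 as derived in the comment
+// inside the loop.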
+static __inline void hpel_lpf_diag_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ ALIGN(16) int16_t scratch[21 * 16] ALIGN2(16); /* 21 rows by 16 pixels per row */
+
+ /*
+ * Intermediate values are the horizontal 1/2-pel samples,
+ * starting at (0.5, -2) at the top and extending to (0.5, height + 3) at the bottom;
+ * scratch holds a 2-D array of size (w) x (h + 5)
+ */
+ __m128i zero = _mm_setzero_si128();
+ __m128i c32,c5 = _mm_set1_epi16(5);
+ int cloop = h + 5;
+ int16_t *h264e_restrict dst16 = scratch;
+ const int16_t *src16 = scratch + 2*16;
+ src -= 2*src_stride;
+ if (w == 8)
+ {
+ src16 = scratch + 2*8;
+ do
+ {
+ __m128i inp = _mm_loadu_si128((__m128i*)(src - 2));
+ _mm_store_si128((__m128i*)dst16, _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(
+ _mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)),
+ 2),
+ _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+ c5),
+ _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))
+ ));
+ src += src_stride;
+ dst16 += 8;
+ } while (--cloop);
+
+ c32 = _mm_set1_epi16(32);
+ cloop = h;
+ do
+ {
+ // (20*x2 - 5*x1 + x0 + 512) >> 10 =>
+ // (16*x2 + 4*x2 - 4*x1 - x1 + x0 + 512) >> 10 =>
+ // ((((x0 - x1) >> 2) + (x2 - x1)) >> 2) + x2 + 32 >> 6
+ __m128i x1 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 1*8)), _mm_load_si128((__m128i*)(src16 + 2*8)));
+ __m128i x2 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 0*8)), _mm_load_si128((__m128i*)(src16 + 1*8)));
+ _mm_storel_epi64((__m128i*)dst,
+ _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_srai_epi16(
+ _mm_sub_epi16(
+ _mm_srai_epi16(
+ _mm_sub_epi16(
+ _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 2*8)), _mm_load_si128((__m128i*)(src16 + 3*8))),
+ x1),
+ 2),
+ _mm_sub_epi16(x1, x2)),
+ 2),
+ _mm_add_epi16(x2, c32)),
+ 6),
+ zero));
+ src16 += 8;
+ dst += 16;
+ } while(--cloop);
+ } else
+ {
+ do
+ {
+ _mm_store_si128((__m128i*)dst16, _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(
+ _mm_add_epi16(MM_LOAD_8TO16(src - 0), MM_LOAD_8TO16(src + 1)),
+ 2),
+ _mm_add_epi16(MM_LOAD_8TO16(src - 1), MM_LOAD_8TO16(src + 2))),
+ c5),
+ _mm_add_epi16(MM_LOAD_8TO16(src - 2), MM_LOAD_8TO16(src + 3))
+ ));
+ _mm_store_si128((__m128i*)(dst16 + 8), _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(
+ _mm_add_epi16(MM_LOAD_8TO16(src + 8 - 0), MM_LOAD_8TO16(src + 8 + 1)),
+ 2),
+ _mm_add_epi16(MM_LOAD_8TO16(src + 8 - 1), MM_LOAD_8TO16(src + 8 + 2))),
+ c5),
+ _mm_add_epi16(MM_LOAD_8TO16(src + 8 - 2), MM_LOAD_8TO16(src + 8 + 3))
+ ));
+ src += src_stride;
+ dst16 += 8*2;
+ } while (--cloop);
+
+ c32 = _mm_set1_epi16(32);
+ cloop = 2*h;
+ do
+ {
+ // (20*x2 - 5*x1 + x0 + 512) >> 10 =>
+ // (16*x2 + 4*x2 - 4*x1 - x1 + x0 + 512) >> 10 =>
+ // ((((x0 - x1) >> 2) + (x2 - x1)) >> 2) + x2 + 32 >> 6
+ __m128i x1 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 1*16)), _mm_load_si128((__m128i*)(src16 + 2*16)));
+ __m128i x2 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 0*16)), _mm_load_si128((__m128i*)(src16 + 1*16)));
+ _mm_storel_epi64((__m128i*)dst,
+ _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_srai_epi16(
+ _mm_sub_epi16(
+ _mm_srai_epi16(
+ _mm_sub_epi16(
+ _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 2*16)), _mm_load_si128((__m128i*)(src16 + 3*16))),
+ x1),
+ 2),
+ _mm_sub_epi16(x1, x2)),
+ 2),
+ _mm_add_epi16(x2, c32)),
+ 6),
+ zero));
+ src16 += 8;
+ dst += 8;
+ } while(--cloop);
+ }
+}
+
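+// Horizontal-only half-pel: the same 6-tap filter along the row, rounded with
+// (sum + 16) >> 5 straight back to 8 bits.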
+static __inline void hpel_lpf_hor_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ __m128i zero = _mm_setzero_si128();
+ const __m128i five = _mm_set1_epi16(5);
+ if (w == 8)
+ {
+ do
+ {
+ __m128i inp = _mm_loadu_si128((__m128i*)(src - 2));
+ _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(_mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)), 2),
+ _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+ five),
+ _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))),
+ _mm_set1_epi16(16)),
+ 5),
+ zero));
+ src += src_stride;
+ dst += 16;
+ } while (--h);
+ } else do
+ {
+ __m128i inp = _mm_loadu_si128((__m128i*)(src - 2));
+ _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(_mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)), 2),
+ _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+ five),
+ _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))),
+ _mm_set1_epi16(16)),
+ 5),
+ zero));
+ inp = _mm_loadu_si128((__m128i*)(src + 8 - 2));
+ _mm_storel_epi64((__m128i*)(dst + 8), _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(_mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)), 2),
+ _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+ five),
+ _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))),
+ _mm_set1_epi16(16)),
+ 5),
+ zero));
+ src += src_stride;
+ dst += 16;
+ } while (--h);
+}
+
+static __inline void hpel_lpf_ver_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ __m128i zero = _mm_setzero_si128();
+ __m128i five = _mm_set1_epi16(5);
+ __m128i const16 = _mm_set1_epi16(16);
+
+ do
+ {
+ int cloop = h;
+ do
+ {
+ _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(
+ _mm_sub_epi16(
+ _mm_slli_epi16(_mm_add_epi16(MM_LOAD_8TO16(src - 0*src_stride), MM_LOAD_8TO16(src + 1*src_stride)), 2),
+ _mm_add_epi16(MM_LOAD_8TO16(src - 1*src_stride), MM_LOAD_8TO16(src + 2*src_stride))),
+ five),
+ _mm_add_epi16(MM_LOAD_8TO16(src - 2*src_stride), MM_LOAD_8TO16(src + 3*src_stride))),
+ const16),
+ 5),
+ zero));
+ src += src_stride;
+ dst += 16;
+ } while(--cloop);
+ src += 8 - src_stride*h;
+ dst += 8 - 16*h;
+ } while ((w -= 8) > 0);
+}
+
+static void average_16x16_unalign_sse(uint8_t *dst, const uint8_t *src, int src_stride)
+{
+ __m128i *d = (__m128i *)dst;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+ _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+}
+
+static void h264e_qpel_average_wh_align_sse2(const uint8_t *src0, const uint8_t *src1, uint8_t *h264e_restrict dst, point_t wh)
+{
+ int w = wh.s.x;
+ int h = wh.s.y;
+ __m128i *d = (__m128i *)dst;
+ const __m128i *s0 = (const __m128i *)src0;
+ const __m128i *s1 = (const __m128i *)src1;
+ if (w == 16)
+ {
+ do
+ {
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+ } while((h -= 8) > 0);
+ } else
+ {
+ do
+ {
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+ } while((h -= 8) > 0);
+ }
+}
+
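+// Quarter-pel luma dispatcher: pos = 1 << (dx + 4*dy) is tested against bit masks
+// to decide which half-pel filters (horizontal, vertical, diagonal) have to run and
+// whether the result is averaged with an integer or half-pel neighbour, per the
+// table below.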
+static void h264e_qpel_interpolate_luma_sse2(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+ ALIGN(16) uint8_t scratch[16*16] ALIGN2(16);
+// src += ((dx + 1) >> 2) + ((dy + 1) >> 2)*src_stride; // dx == 3 ? next row; dy == 3 ? next line
+// dxdy actions: Horizontal, Vertical, Diagonal, Average
+//        dx:  0     1      2      3    +1
+//   dy:  0    -     ha     h      ha+
+//        1    va    hva    hda    hv+a
+//        2    v     vda    d      v+da
+//        3    va+   h+va   h+da   h+v+a
+// +stride
+ int32_t pos = 1 << (dxdy.s.x + 4*dxdy.s.y);
+ uint8_t *h264e_restrict dst0 = dst;
+
+ if (pos == 1)
+ {
+ copy_wh_sse(src, src_stride, dst, wh.s.x, wh.s.y);
+ return;
+ }
+ if (pos & 0xe0ee) // 1110 0000 1110 1110
+ {
+ hpel_lpf_hor_sse(src + ((dxdy.s.y + 1) >> 2)*src_stride, src_stride, dst, wh.s.x, wh.s.y);
+ dst = scratch;
+ }
+ if (pos & 0xbbb0) // 1011 1011 1011 0000
+ {
+ hpel_lpf_ver_sse(src + ((dxdy.s.x + 1) >> 2), src_stride, dst, wh.s.x, wh.s.y);
+ dst = scratch;
+ }
+ if (pos & 0x4e40) // 0100 1110 0100 0000
+ {
+ hpel_lpf_diag_sse(src, src_stride, dst, wh.s.x, wh.s.y);
+ dst = scratch;
+ }
+ if (pos & 0xfafa) // 1111 1010 1111 1010
+ {
+ assert(wh.s.x == 16 && wh.s.y == 16);
+ if (pos & 0xeae0)// 1110 1010 1110 0000
+ {
+ point_t p;
+ p.u32 = 16 + (16 << 16);
+ h264e_qpel_average_wh_align_sse2(scratch, dst0, dst0, p);
+ } else
+ {
+ src += ((dxdy.s.x + 1) >> 2) + ((dxdy.s.y + 1) >> 2)*src_stride;
+ average_16x16_unalign_sse(dst0, src, src_stride);
+ }
+ }
+}
+
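+// Chroma motion compensation: plain bilinear interpolation with weights
+// (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy and dx*dy and (sum + 32) >> 6 rounding.
+// dxdy == 0 degenerates to a copy and dx == 0 or dy == 0 to a two-tap filter.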
+static void h264e_qpel_interpolate_chroma_sse2(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+ __m128i zero = _mm_setzero_si128();
+ int w = wh.s.x;
+ int h = wh.s.y;
+ __m128i a, b, c, d;
+
+// __m128i a = _mm_set1_epi16((short)((8-dx) * (8-dy)));
+// __m128i b = _mm_set1_epi16((short)(dx * (8-dy)));
+// __m128i c = _mm_set1_epi16((short)((8-dx) * dy));
+// __m128i d = _mm_set1_epi16((short)(dx * dy));
+ __m128i c8 = _mm_set1_epi16(8);
+ __m128i y,x = _mm_cvtsi32_si128(dxdy.u32);
+ x = _mm_unpacklo_epi16(x, x);
+ x = _mm_unpacklo_epi32(x, x);
+ y = _mm_unpackhi_epi64(x, x);
+ x = _mm_unpacklo_epi64(x, x);
+ a = _mm_mullo_epi16(_mm_sub_epi16(c8, x), _mm_sub_epi16(c8, y));
+ b = _mm_mullo_epi16(x, _mm_sub_epi16(c8, y));
+ c = _mm_mullo_epi16(_mm_sub_epi16(c8, x), y);
+ d = _mm_mullo_epi16(x, y);
+
+ if (!dxdy.u32)
+ {
+ // 10%
+ if (w == 8) do
+ {
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+ } while(h -= 4);
+ else
+ {
+ do
+ {
+ *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+ *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+ *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+ *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+ } while(h -= 4);
+ }
+ } else
+ if (!dxdy.s.x || !dxdy.s.y)
+ {
+ // 40%
+ int dsrc = dxdy.s.x?1:src_stride;
+ c = _mm_or_si128(c,b);
+
+ if (w==8)
+ {
+ do
+ {
+ _mm_storel_epi64((__m128i *)dst,
+ _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(a, MM_LOAD_8TO16(src)),
+ _mm_mullo_epi16(c, MM_LOAD_8TO16(src + dsrc))),
+ _mm_set1_epi16(32)),
+ 6),
+ zero)) ;
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ } else
+ {
+ do
+ {
+ *(int* )(dst) = _mm_cvtsi128_si32 (
+ _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(a, MM_LOAD_8TO16(src)),
+ _mm_mullo_epi16(c, MM_LOAD_8TO16(src + dsrc))),
+ _mm_set1_epi16(32)),
+ 6),
+ zero));
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ }
+ } else
+ {
+ // 50%
+ if (w == 8)
+ {
+ __m128i x1,x0;
+ x0 = _mm_loadl_epi64((__m128i*)(src));
+ x1 = _mm_loadl_epi64((__m128i*)(src + 1));
+ x0 = _mm_unpacklo_epi8(x0, zero);
+ x1 = _mm_unpacklo_epi8(x1, zero);
+ do
+ {
+ __m128i y0, y1;
+ src += src_stride;
+ y0 = _mm_loadl_epi64((__m128i*)(src));
+ y1 = _mm_loadl_epi64((__m128i*)(src + 1));
+ y0 = _mm_unpacklo_epi8(y0, zero);
+ y1 = _mm_unpacklo_epi8(y1, zero);
+ _mm_storel_epi64((__m128i *)dst,
+ _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(x0, a),
+ _mm_mullo_epi16(x1, b)),
+ _mm_add_epi16(
+ _mm_mullo_epi16(y0, c),
+ _mm_mullo_epi16(y1, d))),
+ _mm_set1_epi16(32)),
+ 6),
+ zero));
+ x0 = y0;
+ x1 = y1;
+ dst += 16;
+ } while (--h);
+ } else
+ {
+ // TODO: load 32!
+ __m128i x1, x0 = MM_LOAD_8TO16(src);
+ do
+ {
+ src += src_stride;
+ x1 = MM_LOAD_8TO16(src);
+ *(int*)(dst) = _mm_cvtsi128_si32(
+ _mm_packus_epi16(
+ _mm_srai_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_add_epi16(
+ _mm_mullo_epi16(x0, a),
+ _mm_mullo_epi16(_mm_srli_si128(x0, 2), b)),
+ _mm_add_epi16(
+ _mm_mullo_epi16(x1, c),
+ _mm_mullo_epi16(_mm_srli_si128(x1, 2), d))),
+ _mm_set1_epi16(32)),
+ 6),
+ zero));
+ x0 = x1;
+ dst += 16;
+ } while (--h);
+ }
+ }
+}
+
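+// 16x16 SAD against the 16-byte-stride reference block; also returns the four 8x8
+// quadrant SADs in sad[0..3] (top-left, top-right, bottom-left, bottom-right).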
+static int h264e_sad_mb_unlaign_8x8_sse2(const pix_t *a, int a_stride, const pix_t *b, int sad[4])
+{
+ __m128i *mb = (__m128i *)b;
+ __m128i s01, s23;
+ s01 = _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++)); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+
+ s23 = _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++)); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+
+ sad[0] = _mm_cvtsi128_si32(s01);
+ sad[1] = _mm_extract_epi16(s01, 4);
+ sad[2] = _mm_cvtsi128_si32(s23);
+ sad[3] = _mm_extract_epi16(s23, 4);
+ return sad[0] + sad[1] + sad[2] + sad[3];
+}
+
+
+static int h264e_sad_mb_unlaign_wh_sse2(const pix_t *a, int a_stride, const pix_t *b, point_t wh)
+{
+ __m128i *mb = (__m128i *)b;
+ __m128i s;
+
+ assert(wh.s.x == 8 || wh.s.x == 16);
+ assert(wh.s.y == 8 || wh.s.y == 16);
+
+ if (wh.s.x == 8)
+ {
+ s = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++)); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+
+ if (wh.s.y == 16)
+ {
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+ }
+ return _mm_extract_epi16 (s, 0);
+ }
+
+ s = _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++)); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+
+ if (wh.s.y == 16)
+ {
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+ }
+
+ s = _mm_adds_epu16(s, _mm_shuffle_epi32(s, 2));
+ return _mm_cvtsi128_si32(s);
+}
+
+static void h264e_copy_8x8_sse2(pix_t *d, int d_stride, const pix_t *s)
+{
+ assert(IS_ALIGNED(d, 8));
+ assert(IS_ALIGNED(s, 8));
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+ _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s)));
+}
+
+static void h264e_copy_16x16_sse2(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+ assert(IS_ALIGNED(d, 8));
+ assert(IS_ALIGNED(s, 8));
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+ _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s)));
+}
+
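+// Pads the reconstructed picture: replicates the top and bottom rows outward by
+// `guard` rows, then replicates the leftmost and rightmost columns sideways by
+// `guard` pixels, corners included.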
+static void h264e_copy_borders_sse2(unsigned char *pic, int w, int h, int guard)
+{
+ int rowbytes = w + 2*guard;
+ int topbot = 2;
+ pix_t *s = pic;
+ pix_t *d = pic - guard*rowbytes;
+ assert(guard == 8 || guard == 16);
+ assert((w % 8) == 0);
+ do
+ {
+ int cloop = w;
+ do
+ {
+ __m128i t = _mm_loadu_si128((__m128i*)(s));
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ if (guard == 16)
+ {
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+ }
+ s += 16;
+ d += 16 - guard*rowbytes;
+ } while((cloop -= 16) > 0);
+ s = pic + (h - 1)*rowbytes;
+ d = s + rowbytes;
+ } while(--topbot);
+
+ {
+ pix_t *s0 = pic - guard*rowbytes;
+ pix_t *s1 = pic - guard*rowbytes + w - 1;
+ int cloop = 2*guard + h;
+ if (guard == 8) do
+ {
+ _mm_storel_epi64((__m128i*)(s0-8), _mm_set1_epi8(*s0));
+ _mm_storel_epi64((__m128i*)(s1+1), _mm_set1_epi8(*s1));
+ s0 += rowbytes;
+ s1 += rowbytes;
+ } while(--cloop); else do
+ {
+ _mm_storeu_si128((__m128i*)(s0-16), _mm_set1_epi8(*s0));
+ _mm_storeu_si128((__m128i*)(s1+1), _mm_set1_epi8(*s1));
+ s0 += rowbytes;
+ s1 += rowbytes;
+ } while(--cloop);
+
+ }
+}
+
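+// In-register 2-D 4x4 Hadamard transform of 16 int16 coefficients: a 4-point
+// butterfly pass on the rows, a transpose via unpacks, then a second pass on the
+// columns.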
+static void hadamar4_2d_sse(int16_t *x)
+{
+ __m128i a = _mm_loadl_epi64((__m128i*)x);
+ __m128i b = _mm_loadl_epi64((__m128i*)(x + 4));
+ __m128i c = _mm_loadl_epi64((__m128i*)(x + 8));
+ __m128i d = _mm_loadl_epi64((__m128i*)(x + 12));
+
+ __m128i u0 = _mm_add_epi16(a, c);
+ __m128i u1 = _mm_sub_epi16(a, c);
+ __m128i u2 = _mm_add_epi16(b, d);
+ __m128i u3 = _mm_sub_epi16(b, d);
+ __m128i v0 = _mm_add_epi16(u0, u2);
+ __m128i v3 = _mm_sub_epi16(u0, u2);
+ __m128i v1 = _mm_add_epi16(u1, u3);
+ __m128i v2 = _mm_sub_epi16(u1, u3);
+
+ // v0: a0 a1 a2 a3
+ // v1: b0 ......
+ // v2: c0 ......
+ // v3: d0 d1 .. d3
+ //
+ __m128i t0 = _mm_unpacklo_epi16(v0, v1); // a0, b0, a1, b1, a2, b2, a3, b3
+ __m128i t2 = _mm_unpacklo_epi16(v2, v3); // c0, d0, c1, d1, c2, d2, c3, d3
+ a = _mm_unpacklo_epi32(t0, t2); // a0, b0, c0, d0, a1, b1, c1, d1
+ c = _mm_unpackhi_epi32(t0, t2); // a2, b2, c2, d2, a3, b3, c3, d3
+ u0 = _mm_add_epi16(a, c); // u0 u2
+ u1 = _mm_sub_epi16(a, c); // u1 u3
+ v0 = _mm_unpacklo_epi64(u0, u1); // u0 u1
+ v1 = _mm_unpackhi_epi64(u0, u1); // u2 u3
+ u0 = _mm_add_epi16(v0, v1); // v0 v1
+ u1 = _mm_sub_epi16(v0, v1); // v3 v2
+
+ v1 = _mm_shuffle_epi32(u1, 0x4e); // u2 u3 01001110
+ _mm_store_si128((__m128i*)x, u0);
+ _mm_store_si128((__m128i*)(x + 8), v1);
+
+}
+
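+// Scalar DC helpers: quant_dc_sse quantizes with sign-dependent Q18 rounding and
+// dequant_dc_sse scatters the dequantized DC back into each block's dq[0].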
+static void dequant_dc_sse(quant_t *q, int16_t *qval, int dequant, int n)
+{
+ do q++->dq[0] = (int16_t)(*qval++*(int16_t)dequant); while (--n);
+}
+
+static void quant_dc_sse(int16_t *qval, int16_t *deq, int16_t quant, int n, int round_q18)
+{
+ int r_minus = (1 << 18) - round_q18;
+ do
+ {
+ int v = *qval;
+ int r = v < 0 ? r_minus : round_q18;
+ *deq++ = *qval++ = (v * quant + r) >> 18;
+ } while (--n);
+}
+
+static void hadamar2_2d_sse(int16_t *x)
+{
+ int a = x[0];
+ int b = x[1];
+ int c = x[2];
+ int d = x[3];
+ x[0] = (int16_t)(a + b + c + d);
+ x[1] = (int16_t)(a - b + c - d);
+ x[2] = (int16_t)(a + b - c - d);
+ x[3] = (int16_t)(a - b - c + d);
+}
+
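+// Luma DC path: forward 4x4 Hadamard, quantize, Hadamard again (the transform is
+// self-inverse up to scaling), dequantize. The transformed DCs live in the 16 int16
+// slots directly in front of the quant_t array.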
+static void h264e_quant_luma_dc_sse2(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+ int16_t *tmp = ((int16_t*)q) - 16;
+ hadamar4_2d_sse(tmp);
+ quant_dc_sse(tmp, deq, qdat[0], 16, 0x20000);//0x15555);
+ hadamar4_2d_sse(tmp);
+ assert(!(qdat[1] & 3));
+ // dirty trick here: shift w/o rounding, since it has no effect for qp >= 10 (or, to be precise, for qp >= 9)
+ dequant_dc_sse(q, tmp, qdat[1] >> 2, 16);
+}
+
+static int h264e_quant_chroma_dc_sse2(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+ int16_t *tmp = ((int16_t*)q) - 16;
+ hadamar2_2d_sse(tmp);
+ quant_dc_sse(tmp, deq, (int16_t)(qdat[0] << 1), 4, 0xAAAA);
+ hadamar2_2d_sse(tmp);
+ assert(!(qdat[1] & 1));
+ dequant_dc_sse(q, tmp, qdat[1] >> 1, 4);
+ return !!(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
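+// Zero-block test: returns non-zero when every coefficient of the 4x4 block
+// (optionally ignoring the DC at index 0) is at or below its threshold.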
+static int is_zero_sse(const int16_t *dat, int i0, const uint16_t *thr)
+{
+ __m128i t = _mm_loadu_si128((__m128i*)(thr));
+ __m128i d = _mm_load_si128((__m128i*)(dat));
+ __m128i z = _mm_setzero_si128();
+ __m128i m, sign;
+ if (i0) d = _mm_insert_epi16 (d, 0, 0);
+
+ sign = _mm_cmpgt_epi16(z, d);
+ d = _mm_sub_epi16(_mm_xor_si128(d, sign), sign);
+
+ m = _mm_cmpgt_epi16(d, t);
+ d = _mm_loadu_si128((__m128i*)(dat + 8));
+ sign = _mm_cmpgt_epi16(z, d);
+ d = _mm_sub_epi16(_mm_xor_si128(d, sign), sign);
+ m = _mm_or_si128(m, _mm_cmpgt_epi16(d, t));
+ return !_mm_movemask_epi8(m);
+}
+
+static int is_zero4_sse(const quant_t *q, int i0, const uint16_t *thr)
+{
+ return is_zero_sse(q[0].dq, i0, thr) &&
+ is_zero_sse(q[1].dq, i0, thr) &&
+ is_zero_sse(q[4].dq, i0, thr) &&
+ is_zero_sse(q[5].dq, i0, thr);
+}
+
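+// Forward path for one macroblock component: computes the 4x4 residual transform
+// two blocks at a time, optionally collects the DCs for the Hadamard pass, applies
+// the zero-block thresholds (per 4x4, and per 8x8 group for inter), then quantizes
+// and dequantizes each surviving block while building the non-zero block mask.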
+static int h264e_transform_sub_quant_dequant_sse2(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat)
+{
+ int crow = mode >> 1;
+ int ccol = crow;
+ int i, i0 = mode & 1;
+ int nz_block_mask = 0;
+ int zmask = 0;
+ quant_t *q_0 = q;
+
+ int y, x;
+ for (y = 0; y < crow; y++)
+ {
+ for (x = 0; x < ccol; x += 2)
+ {
+ const pix_t *pinp = inp + inp_stride*4*y + 4*x;
+ const pix_t *ppred = pred + 16*4*y + 4*x;
+
+ __m128i d0, d1, d2, d3;
+ __m128i t0, t1, t2, t3;
+ __m128i q0, q1, q2, q3;
+ __m128i zero = _mm_setzero_si128();
+ __m128i inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp), zero);
+ __m128i pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ppred), zero);
+
+ d0 =_mm_sub_epi16(inp8, pred8);
+ pinp += inp_stride;
+ inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp), zero);
+ pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ppred + 16)), zero);
+ d1 =_mm_sub_epi16(inp8, pred8);
+ pinp += inp_stride;
+ inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp), zero);
+ pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ppred + 32)), zero);
+ d2 =_mm_sub_epi16(inp8, pred8);
+ pinp += inp_stride;
+ inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp), zero);
+ pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ppred + 48)), zero);
+ d3 =_mm_sub_epi16(inp8, pred8);
+ t0 = _mm_add_epi16(d0, d3);
+ t1 = _mm_sub_epi16(d0, d3);
+ t2 = _mm_add_epi16(d1, d2);
+ t3 = _mm_sub_epi16(d1, d2);
+ q0 = _mm_add_epi16(t0, t2);
+ q1 = _mm_add_epi16(_mm_add_epi16(t1, t1), t3);
+ q2 = _mm_sub_epi16(t0, t2);
+ q3 = _mm_sub_epi16(t1, _mm_add_epi16(t3, t3));
+
+ // q0: a0 a1 ....... a7
+ // q1: b0 .............
+ // q2: c0 .............
+ // q3: d0 d1 ....... d7
+ //
+ t0 = _mm_unpacklo_epi16(q0, q1); // a0, b0, a1, b1, a2, b2, a3, b3
+ t1 = _mm_unpackhi_epi16(q0, q1); // a4, b4, a5, b5, a6, b6, a7, b7
+ t2 = _mm_unpacklo_epi16(q2, q3); // c0, d0
+ t3 = _mm_unpackhi_epi16(q2, q3); // c4, d4
+
+ q0 = _mm_unpacklo_epi32(t0, t2); // a0, b0, c0, d0, a1, b1, c1, d1
+ q1 = _mm_unpackhi_epi32(t0, t2); // a2, b2,
+ q2 = _mm_unpacklo_epi32(t1, t3); // a4, b4
+ q3 = _mm_unpackhi_epi32(t1, t3); // a6, b6
+
+ d0 = _mm_unpacklo_epi64(q0, q2); // a0, b0, c0, d0, a4, b4, c4, d4
+ d1 = _mm_unpackhi_epi64(q0, q2); // a1, b1, c1, d1
+ d2 = _mm_unpacklo_epi64(q1, q3); // a2, b2,
+ d3 = _mm_unpackhi_epi64(q1, q3); // a3, b3,
+
+ t0 = _mm_add_epi16(d0, d3);
+ t1 = _mm_sub_epi16(d0, d3);
+ t2 = _mm_add_epi16(d1, d2);
+ t3 = _mm_sub_epi16(d1, d2);
+ q0 = _mm_add_epi16(t0, t2);
+ q1 = _mm_add_epi16(_mm_add_epi16(t1, t1), t3);
+ q2 = _mm_sub_epi16(t0, t2);
+ q3 = _mm_sub_epi16(t1, _mm_add_epi16(t3, t3));
+
+ _mm_storel_epi64((__m128i*)(q[0].dq ), q0);
+ _mm_storel_epi64((__m128i*)(q[0].dq + 4), q1);
+ _mm_storel_epi64((__m128i*)(q[0].dq + 8), q2);
+ _mm_storel_epi64((__m128i*)(q[0].dq + 12), q3);
+ if (ccol > 1)
+ {
+ q0 = _mm_unpackhi_epi64(q0, q0); _mm_storel_epi64((__m128i*)(q[1].dq ), q0);
+ q1 = _mm_unpackhi_epi64(q1, q1); _mm_storel_epi64((__m128i*)(q[1].dq + 4), q1);
+ q2 = _mm_unpackhi_epi64(q2, q2); _mm_storel_epi64((__m128i*)(q[1].dq + 8), q2);
+ q3 = _mm_unpackhi_epi64(q3, q3); _mm_storel_epi64((__m128i*)(q[1].dq + 12), q3);
+ }
+ q += 2;
+ }
+ }
+ q = q_0;
+ crow = mode >> 1;
+ ccol = crow;
+
+ if (mode & 1) // QDQ_MODE_INTRA_16 || QDQ_MODE_CHROMA
+ {
+ int cloop = (mode >> 1)*(mode >> 1);
+ short *dc = ((short *)q) - 16;
+ quant_t *pq = q;
+ do
+ {
+ *dc++ = pq->dq[0];
+ pq++;
+ } while (--cloop);
+ }
+
+ if (mode == QDQ_MODE_INTER || mode == QDQ_MODE_CHROMA)
+ {
+ for (i = 0; i < crow*ccol; i++)
+ {
+ if (is_zero_sse(q[i].dq, i0, qdat + OFFS_THR_1_OFF))
+ {
+ zmask |= (1 << i);
+ }
+ }
+
+ if (mode == QDQ_MODE_INTER)
+ {
+ if ((~zmask & 0x0033) && is_zero4_sse(q + 0, i0, qdat + OFFS_THR_2_OFF)) zmask |= 0x33;
+ if ((~zmask & 0x00CC) && is_zero4_sse(q + 2, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 2);
+ if ((~zmask & 0x3300) && is_zero4_sse(q + 8, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 8);
+ if ((~zmask & 0xCC00) && is_zero4_sse(q + 10, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 10);
+ }
+ }
+
+ do
+ {
+ do
+ {
+ int nz_mask = 0;
+ if (zmask & 1)
+ {
+ _mm_store_si128((__m128i*)(q->qv), _mm_setzero_si128());
+ _mm_store_si128((__m128i*)(q->qv) + 1, _mm_setzero_si128());
+ } else
+ {
+ int16_t *qv_tmp = q->qv;//[16];
+ __m128i t;
+ const __m128i const_q = _mm_loadu_si128((__m128i*)(qdat + OFFS_QUANT_VECT));
+ const __m128i const_dq = _mm_loadu_si128((__m128i*)(qdat + OFFS_DEQUANT_VECT));
+
+ __m128i src = _mm_load_si128((__m128i*)(q[0].dq));
+ __m128i r = _mm_xor_si128(_mm_set1_epi16(qdat[OFFS_RND_INTER]), _mm_cmpgt_epi16(_mm_setzero_si128(), src));
+ __m128i lo = _mm_mullo_epi16(src, const_q);
+ __m128i hi = _mm_mulhi_epi16(src, const_q);
+ __m128i dst0 = _mm_unpacklo_epi16(lo, hi);
+ __m128i dst1 = _mm_unpackhi_epi16(lo, hi);
+ dst0 = _mm_srai_epi32(_mm_add_epi32(dst0, _mm_unpacklo_epi16(r, _mm_setzero_si128())), 16);
+ dst1 = _mm_srai_epi32(_mm_add_epi32(dst1, _mm_unpackhi_epi16(r, _mm_setzero_si128())), 16);
+ dst0 = _mm_packs_epi32(dst0, dst1);
+ _mm_store_si128((__m128i*)(qv_tmp), dst0);
+
+ t = _mm_cmpeq_epi16(_mm_setzero_si128(), dst0);
+ nz_mask = _mm_movemask_epi8( _mm_packs_epi16(t, t)) & 0xff;
+ dst0 = _mm_mullo_epi16(dst0, const_dq);
+ _mm_store_si128((__m128i*)(q[0].dq), dst0);
+
+
+ src = _mm_load_si128((__m128i*)(q[0].dq + 8));
+ r = _mm_xor_si128(_mm_set1_epi16(qdat[OFFS_RND_INTER]), _mm_cmpgt_epi16(_mm_setzero_si128(), src));
+ lo = _mm_mullo_epi16(src, const_q);
+ hi = _mm_mulhi_epi16(src, const_q);
+ dst0 = _mm_unpacklo_epi16(lo, hi);
+ dst1 = _mm_unpackhi_epi16(lo, hi);
+
+ dst0 = _mm_srai_epi32(_mm_add_epi32(dst0, _mm_unpacklo_epi16(r, _mm_setzero_si128())), 16);
+ dst1 = _mm_srai_epi32(_mm_add_epi32(dst1, _mm_unpackhi_epi16(r, _mm_setzero_si128())), 16);
+ dst0 = _mm_packs_epi32(dst0, dst1);
+ _mm_store_si128((__m128i*)(qv_tmp + 8), dst0);
+
+ t = _mm_cmpeq_epi16(_mm_setzero_si128(), dst0);
+ nz_mask |= _mm_movemask_epi8( _mm_packs_epi16(t, t)) << 8;
+ dst0 = _mm_mullo_epi16(dst0, const_dq);
+ _mm_store_si128((__m128i*)(q[0].dq + 8), dst0);
+ nz_mask = ~nz_mask & 0xffff;
+ if (i0)
+ {
+ nz_mask &= ~1;
+ }
+ }
+
+ zmask >>= 1;
+ nz_block_mask <<= 1;
+ if (nz_mask)
+ nz_block_mask |= 1;
+ q++;
+ } while (--ccol);
+ ccol = mode >> 1;
+ } while (--crow);
+ return nz_block_mask;
+}
+
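+/*
+ * SSE2 inverse transform + reconstruction.  Block flags are consumed
+ * MSB-first from mask: a clear bit means the 4x4 block has no
+ * coefficients, so the prediction is copied to out unchanged; otherwise
+ * q->dq is run through the H.264 inverse integer transform butterfly,
+ *
+ *     e0 = d0 + d2;         e1 = d0 - d2;
+ *     e2 = (d1 >> 1) - d3;  e3 = d1 + (d3 >> 1);
+ *     r0 = e0 + e3;  r1 = e1 + e2;  r2 = e1 - e2;  r3 = e0 - e3;
+ *
+ * applied twice with a transpose in between, rounded by (+32) >> 6 and
+ * added to the prediction with unsigned saturation.
+ */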
+static void h264e_transform_add_sse2(pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask)
+{
+ int crow = side;
+ int ccol = crow;
+
+ assert(IS_ALIGNED(out, 4));
+ assert(IS_ALIGNED(pred, 4));
+ assert(!(out_stride % 4));
+
+ do
+ {
+ do
+ {
+ if (mask >= 0)
+ {
+ // copy 4x4
+ pix_t *dst = out;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 0 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 1 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 2 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 3 * 16);
+ }
+ else
+ {
+ __m128i zero = _mm_setzero_si128();
+ __m128i c32 = _mm_set1_epi16(32);
+ __m128i d0, d1, d2, d3;
+ __m128i e0, e1, e2, e3;
+ d0 = _mm_load_si128((__m128i*)(q->dq + 0));
+ d2 = _mm_load_si128((__m128i*)(q->dq + 8));
+ d1 = _mm_unpackhi_epi64(d0, d2);
+ d3 = _mm_unpackhi_epi64(d2, d0);
+
+ e0 = _mm_add_epi16(d0, d2);
+ e1 = _mm_sub_epi16(d0, d2);
+
+ e2 = _mm_srai_epi16(d1, 1);
+ e2 = _mm_sub_epi16(e2, d3);
+ e3 = _mm_srai_epi16(d3, 1);
+ e3 = _mm_add_epi16(e3, d1);
+
+ d0 = _mm_add_epi16(e0, e3);
+ d1 = _mm_add_epi16(e1, e2);
+ d2 = _mm_sub_epi16(e1, e2);
+ d3 = _mm_sub_epi16(e0, e3);
+
+ e1 = _mm_unpacklo_epi16(d0, d1); // a0, b0, a1, b1, a2, b2, a3, b3
+ e3 = _mm_unpacklo_epi16(d2, d3); // c0, d0
+
+ e0 = _mm_unpacklo_epi32(e1, e3); // a0, b0, c0, d0, a1, b1, c1, d1
+ e2 = _mm_unpackhi_epi32(e1, e3); // a2, b2,
+
+ e1 = _mm_unpackhi_epi64(e0, e2);
+ e3 = _mm_unpackhi_epi64(e2, e0);
+
+ d0 = _mm_add_epi16(e0, e2);
+ d1 = _mm_sub_epi16(e0, e2);
+ d2 = _mm_srai_epi16(e1, 1);
+ d2 = _mm_sub_epi16(d2, e3);
+ d3 = _mm_srai_epi16(e3, 1);
+ d3 = _mm_add_epi16(d3, e1);
+
+ // Pack 4x64 to 2x128
+ e0 = _mm_unpacklo_epi64(d0, d1);
+ e1 = _mm_unpacklo_epi64(d3, d2);
+
+ e0 = _mm_add_epi16(e0, c32);
+ d0 = _mm_srai_epi16(_mm_add_epi16(e0, e1), 6);
+ d3 = _mm_srai_epi16(_mm_sub_epi16(e0, e1), 6);
+ // Unpack back to 4x64
+ d1 = _mm_unpackhi_epi64(d0, zero);
+ d2 = _mm_unpackhi_epi64(d3, zero);
+
+ *(int* )(out) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 0)), zero), d0), zero));
+ *(int* )(out + 1*out_stride) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 16)), zero), d1), zero));
+ *(int* )(out + 2*out_stride) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 32)), zero), d2), zero));
+ *(int* )(out + 3*out_stride) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 48)), zero), d3), zero));
+
+ }
+ mask = (uint32_t)mask << 1;
+ q++;
+ out += 4;
+ pred += 4;
+ } while (--ccol);
+ ccol = side;
+ out += 4*(out_stride - ccol);
+ pred += 4*(16 - ccol);
+ } while (--crow);
+}
+#endif
+
+#if H264E_ENABLE_NEON && !defined(MINIH264_ASM)
+#define TR32(x, y) tr0 = vtrnq_u32(vreinterpretq_u32_u8(x), vreinterpretq_u32_u8(y)); x = vreinterpretq_u8_u32(tr0.val[0]); y = vreinterpretq_u8_u32(tr0.val[1]);
+#define TR16(x, y) tr1 = vtrnq_u16(vreinterpretq_u16_u8(x), vreinterpretq_u16_u8(y)); x = vreinterpretq_u8_u16(tr1.val[0]); y = vreinterpretq_u8_u16(tr1.val[1]);
+#define TR8(x, y) tr2 = vtrnq_u8((x), (y)); x = (tr2.val[0]); y = (tr2.val[1]);
+
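+/*
+ * The TR32/TR16/TR8 macros above transpose, in registers, the 16x16 byte
+ * tile held in q8..q15 (each q register carries two 8-byte rows).  The
+ * vertical-edge deblockers load 16 rows of the 8 pixels straddling the
+ * edge, transpose so the edge runs horizontally, reuse the horizontal
+ * filter arithmetic, then transpose back before storing.
+ */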
+static void deblock_luma_v_neon(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+ uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15;
+ uint8x16_t tmp;
+ uint32x4x2_t tr0;
+ uint16x8x2_t tr1;
+ uint8x16x2_t tr2;
+ q8 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q9 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q10= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q11= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q12= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q13= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q14= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q15= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+
+ TR32(q8, q12);
+ TR32(q9, q13);
+ TR32(q10, q14);
+ TR32(q11, q15);
+ TR16(q8, q10);
+ TR16(q9, q11);
+ TR16(q12, q14);
+ TR16(q13, q15);
+ TR8(q8, q9 );
+ TR8(q10, q11);
+ TR8(q12, q13);
+ TR8(q14, q15);
+
+ q1 = vabdq_u8(q11, q12);
+ q2 = vcltq_u8(q1, vdupq_n_u8(alpha));
+ q1 = vcltq_u8(vmaxq_u8(vabdq_u8(q11, q10), vabdq_u8(q12, q13)), vdupq_n_u8(beta));
+ q2 = vandq_u8(q2, q1);
+
+ tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pstr));
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ q1 = tmp;
+
+ q1 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+ q2 = vandq_u8(q2, q1);
+ q7 = vhsubq_u8(q10, q13);
+ q7 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(q7), 1));
+ q0 = veorq_u8(q12, q11);
+ q6 = vandq_u8(vdupq_n_u8(1), q0);
+
+ q0 = vhsubq_u8(q12, q11);// (q0-p0)>>1
+
+ q7 = vreinterpretq_u8_s8(vrhaddq_s8(vreinterpretq_s8_u8(q7), vreinterpretq_s8_u8(q6))); //((p1-q1)>>2 + carry + 1) >> 1
+ q7 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(q0), vreinterpretq_s8_u8(q7))); //=delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+ q7 = vandq_u8(q7, q2);
+
+ tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pthr));
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ q1 = tmp;
+
+ q1 = vandq_u8(q2, q1);
+
+ q0 = vabdq_u8(q9, q11); // ap = ABS(p2 - p0);
+ q0 = vcltq_u8(q0, vdupq_n_u8(beta)); //sp = (ap - beta) >> 31;
+ q4 = vandq_u8(q0, q2); // & sp
+ q0 = vabdq_u8(q14, q12); //aq = ABS(q2 - q0);
+ q0 = vcltq_u8(q0, vdupq_n_u8(beta)) ;//sq = (aq - beta) >> 31;
+ q3 = vandq_u8(q0, q2); // & sq
+
+ q0 = vrhaddq_u8(q11, q12);//((p0+q0+1)>>1)
+ q0 = vhaddq_u8 (q0, q9 );//((p2 + ((p0+q0+1)>>1))>>1)
+ q5 = vandq_u8 (q1, q4 );
+ q6 = vqaddq_u8 (q10, q5 );//{p1+thr}
+ q0 = vminq_u8 (q0, q6 );
+ q6 = vqsubq_u8 (q10, q5 );//{p1-thr}
+ q10 = vmaxq_u8 (q0, q6 );
+
+ q0 = vrhaddq_u8(q11, q12);// ;((p0+q0+1)>>1)
+ q0 = vhaddq_u8 (q0, q14);// ;((q2 + ((p0+q0+1)>>1))>>1)
+ q5 = vandq_u8 (q1, q3 );
+ q6 = vqaddq_u8 (q13, q5 );// ;{q1+thr}
+ q0 = vminq_u8 (q0, q6 );
+ q6 = vqsubq_u8 (q13, q5 );// ;{q1-thr}
+ q13 = vmaxq_u8 (q0, q6 );
+
+ q1 = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q3)));
+ q1 = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q4))); //tC = thr - sp - sq;
+ q1 = vandq_u8(q1, q2);// ; set thr = 0 if str==0
+
+ q6 = veorq_u8(q6, q6);
+ q5 = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //delta > 0
+ q7 = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7)));
+ q6 = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //-(delta < 0)
+ q5 = vminq_u8(q1, q5);
+ q6 = vminq_u8(q1, q6);
+
+ q11 = vqaddq_u8(q11, q5);
+ q11 = vqsubq_u8(q11, q6);
+ q12 = vqsubq_u8(q12, q5);
+ q12 = vqaddq_u8(q12, q6);
+
+ TR8(q8, q9 );
+ TR8(q10, q11);
+ TR8(q12, q13);
+ TR8(q14, q15);
+ TR16(q8, q10);
+ TR16(q9, q11);
+ TR16(q12, q14);
+ TR16(q13, q15);
+ TR32(q8, q12);
+ TR32(q9, q13);
+ TR32(q10, q14);
+ TR32(q11, q15);
+
+ pix -= 8*stride + 4;
+ vst1_u8(pix, vget_low_u8(q8)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q9)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q10)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q11)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q12)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q13)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q14)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q15)); pix += stride;
+
+ vst1_u8(pix, vget_high_u8(q8)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q9)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q10)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q11)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q12)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q13)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q14)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q15)); pix += stride;
+}
+
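+/*
+ * Strong (bS == 4) luma filters.  Filtering is enabled where
+ * |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta; the wide filter on
+ * each side additionally needs |p0-q0| < (alpha >> 2) + 2 and
+ * |p2-p0| < beta (resp. |q2-q0| < beta), in which case up to three pixels
+ * per side (p2..p0 / q0..q2) are rewritten.  The spec's weighted averages
+ * are assembled from halving/rounding adds so the whole filter stays in
+ * unsigned 8-bit lanes.
+ */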
+static void deblock_luma_h_s4_neon(uint8_t *pix, int stride, int alpha, int beta)
+{
+ uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, vspill0, vspill1;
+ q8 = vld1q_u8(pix - 4*stride);
+ q9 = vld1q_u8(pix - 3*stride);
+ q10 = vld1q_u8(pix - 2*stride);
+ q11 = vld1q_u8(pix - 1*stride);
+ q12 = vld1q_u8(pix);
+ q13 = vld1q_u8(pix + 1*stride);
+ q14 = vld1q_u8(pix + 2*stride);
+ q15 = vld1q_u8(pix + 3*stride);
+ q0 = vabdq_u8(q11, q12);
+ q2 = vcltq_u8(q0, vdupq_n_u8(alpha));
+ q2 = vandq_u8(q2, vcltq_u8(vabdq_u8(q11, q10), vdupq_n_u8(beta)));
+ q2 = vandq_u8(q2, vcltq_u8(vabdq_u8(q12, q13), vdupq_n_u8(beta)));
+ q1 = vandq_u8(q2, vcltq_u8(q0, vdupq_n_u8(((alpha >> 2) + 2))));
+ q0 = vandq_u8(q1, vcltq_u8(vabdq_u8(q9, q11), vdupq_n_u8(beta)));
+ q3 = vandq_u8(q1, vcltq_u8(vabdq_u8(q14, q12), vdupq_n_u8(beta)));
+ q4 = vhaddq_u8(q9, q10);
+ q5 = vhaddq_u8(q11, q12);
+ q6 = vsubq_u8(vrhaddq_u8(q9, q10), q4);
+ q7 = vsubq_u8(vrhaddq_u8(q11, q12), q5);
+ q6 = vhaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q4, q8);
+ q4 = vhaddq_u8(q4, q8);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q5, q9);
+ q5 = vhaddq_u8(q5, q9);
+ q7 = vsubq_u8(q7, q5);
+ q6 = vhaddq_u8(q6, q7);
+
+ q7 = vrhaddq_u8(q4, q5);
+ q4 = vhaddq_u8(q4, q5);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vrhaddq_u8(q6, q7);
+ q4 = vaddq_u8(q4, q6);
+ vspill0 = vbslq_u8(q0, q4, q9); // VMOV q6, q9 VBIT q6, q4, q0
+
+ q4 = vhaddq_u8(q14, q13);
+ q5 = vhaddq_u8(q12, q11);
+ q6 = vsubq_u8(vrhaddq_u8(q14, q13), q4);
+ q7 = vsubq_u8(vrhaddq_u8(q12, q11), q5);
+ q6 = vhaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q4, q15);
+ q4 = vhaddq_u8(q4, q15);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q5, q14);
+ q5 = vhaddq_u8(q5, q14);
+ q7 = vsubq_u8(q7, q5);
+ q6 = vhaddq_u8(q6, q7);
+
+ q7 = vrhaddq_u8(q4, q5);
+ q4 = vhaddq_u8(q4, q5);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vrhaddq_u8(q6, q7);
+ q4 = vaddq_u8(q4, q6);
+ vspill1 = vbslq_u8(q3, q4, q14); // VMOV q6, q14 VBIT q6, q4, q3
+
+ q1 = vhaddq_u8 (q9, q13);
+ q4 = vrhaddq_u8(q1, q10);
+ q5 = vrhaddq_u8(q11, q12);
+ q6 = vhaddq_u8 (q1, q10);
+ q7 = vhaddq_u8 (q11, q12);
+ q4 = vhaddq_u8 (q4, q5);
+ q6 = vrhaddq_u8(q6, q7);
+ q1 = vrhaddq_u8(q4, q6);
+ q4 = vrhaddq_u8(q9, q10);
+ q5 = vrhaddq_u8(q11, q12);
+ q6 = vhaddq_u8 (q9, q10);
+ q7 = vhaddq_u8 (q11, q12);
+ q4 = vhaddq_u8 (q4, q5);
+ q6 = vrhaddq_u8(q6, q7);
+ q4 = vrhaddq_u8(q4, q6);
+ q5 = vhaddq_u8 (q11, q13);
+ q5 = vrhaddq_u8(q5, q10);
+
+ q1 = vbslq_u8(q0, q1, q5); //VBIF q1, q5, q0
+ q0 = vbslq_u8(q0, q4, q10);//VBSL q0, q4, q10
+
+ q7 = vhaddq_u8 (q14, q10);
+ q4 = vrhaddq_u8(q7, q13);
+ q5 = vrhaddq_u8(q11, q12);
+ q6 = vhaddq_u8 (q7, q13);
+ q7 = vhaddq_u8 (q11, q12);
+ q4 = vhaddq_u8 (q4, q5 );
+ q6 = vrhaddq_u8(q6, q7 );
+ q4 = vrhaddq_u8(q4, q6 );
+ q6 = vrhaddq_u8(q14, q13);
+ q5 = vrhaddq_u8(q11, q12);
+ q5 = vhaddq_u8 (q6, q5 );
+ q6 = vhaddq_u8 (q14, q13);
+ q7 = vhaddq_u8 (q11, q12);
+ q6 = vrhaddq_u8(q6, q7 );
+ q5 = vrhaddq_u8(q5, q6 );
+ q6 = vhaddq_u8 (q12, q10);
+ q6 = vrhaddq_u8(q6, q13);
+
+ q4 = vbslq_u8(q3, q4, q6); // VBIF q4, q6, q3 ;q0
+ q3 = vbslq_u8(q3, q5, q13);// VBSL q3, q5, q13 ;q1
+
+ q10 = vbslq_u8(q2, q0, q10);
+ q11 = vbslq_u8(q2, q1, q11);
+ q12 = vbslq_u8(q2, q4, q12);
+ q13 = vbslq_u8(q2, q3, q13);
+
+ vst1q_u8(pix - 3*stride, vspill0);
+ vst1q_u8(pix - 2*stride, q10);
+ vst1q_u8(pix - 1*stride, q11);
+ vst1q_u8(pix , q12);
+ vst1q_u8(pix + 1*stride, q13);
+ vst1q_u8(pix + 2*stride, vspill1);
+
+}
+
+static void deblock_luma_v_s4_neon(uint8_t *pix, int stride, int alpha, int beta)
+{
+ uint32x4x2_t tr0;
+ uint16x8x2_t tr1;
+ uint8x16x2_t tr2;
+ uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, vspill0, vspill1;
+ q8 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q9 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q10= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q11= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q12= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q13= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q14= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+ q15= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+
+ TR32(q8, q12);
+ TR32(q9, q13);
+ TR32(q10, q14);
+ TR32(q11, q15);
+ TR16(q8, q10);
+ TR16(q9, q11);
+ TR16(q12, q14);
+ TR16(q13, q15);
+ TR8(q8, q9 );
+ TR8(q10, q11);
+ TR8(q12, q13);
+ TR8(q14, q15);
+
+ q0 = vabdq_u8(q11, q12);
+ q2 = vcltq_u8(q0, vdupq_n_u8(alpha));
+ q2 = vandq_u8(q2, vcltq_u8(vabdq_u8(q11, q10), vdupq_n_u8(beta)));
+ q2 = vandq_u8(q2, vcltq_u8(vabdq_u8(q12, q13), vdupq_n_u8(beta)));
+ q1 = vandq_u8(q2, vcltq_u8(q0, vdupq_n_u8(((alpha >> 2) + 2))));
+ q0 = vandq_u8(q1, vcltq_u8(vabdq_u8(q9, q11), vdupq_n_u8(beta)));
+ q3 = vandq_u8(q1, vcltq_u8(vabdq_u8(q14, q12), vdupq_n_u8(beta)));
+ q4 = vhaddq_u8(q9, q10);
+ q5 = vhaddq_u8(q11, q12);
+ q6 = vsubq_u8(vrhaddq_u8(q9, q10), q4);
+ q7 = vsubq_u8(vrhaddq_u8(q11, q12), q5);
+ q6 = vhaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q4, q8);
+ q4 = vhaddq_u8(q4, q8);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q5, q9);
+ q5 = vhaddq_u8(q5, q9);
+ q7 = vsubq_u8(q7, q5);
+ q6 = vhaddq_u8(q6, q7);
+
+ q7 = vrhaddq_u8(q4, q5);
+ q4 = vhaddq_u8(q4, q5);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vrhaddq_u8(q6, q7);
+ q4 = vaddq_u8(q4, q6);
+ vspill0 = vbslq_u8(q0, q4, q9); // VMOV q6, q9 VBIT q6, q4, q0
+
+ q4 = vhaddq_u8(q14, q13);
+ q5 = vhaddq_u8(q12, q11);
+ q6 = vsubq_u8(vrhaddq_u8(q14, q13), q4);
+ q7 = vsubq_u8(vrhaddq_u8(q12, q11), q5);
+ q6 = vhaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q4, q15);
+ q4 = vhaddq_u8(q4, q15);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vaddq_u8(q6, q7);
+ q7 = vrhaddq_u8(q5, q14);
+ q5 = vhaddq_u8(q5, q14);
+ q7 = vsubq_u8(q7, q5);
+ q6 = vhaddq_u8(q6, q7);
+
+ q7 = vrhaddq_u8(q4, q5);
+ q4 = vhaddq_u8(q4, q5);
+ q7 = vsubq_u8(q7, q4);
+ q6 = vrhaddq_u8(q6, q7);
+ q4 = vaddq_u8(q4, q6);
+ vspill1 = vbslq_u8(q3, q4, q14); // VMOV q6, q14 VBIT q6, q4, q3
+
+ q1 = vhaddq_u8 (q9, q13);
+ q4 = vrhaddq_u8(q1, q10);
+ q5 = vrhaddq_u8(q11, q12);
+ q6 = vhaddq_u8 (q1, q10);
+ q7 = vhaddq_u8 (q11, q12);
+ q4 = vhaddq_u8 (q4, q5);
+ q6 = vrhaddq_u8(q6, q7);
+ q1 = vrhaddq_u8(q4, q6);
+ q4 = vrhaddq_u8(q9, q10);
+ q5 = vrhaddq_u8(q11, q12);
+ q6 = vhaddq_u8 (q9, q10);
+ q7 = vhaddq_u8 (q11, q12);
+ q4 = vhaddq_u8 (q4, q5);
+ q6 = vrhaddq_u8(q6, q7);
+ q4 = vrhaddq_u8(q4, q6);
+ q5 = vhaddq_u8 (q11, q13);
+ q5 = vrhaddq_u8(q5, q10);
+
+ q1 = vbslq_u8(q0, q1, q5); //VBIF q1, q5, q0
+ q0 = vbslq_u8(q0, q4, q10);//VBSL q0, q4, q10
+
+ q7 = vhaddq_u8 (q14, q10);
+ q4 = vrhaddq_u8(q7, q13);
+ q5 = vrhaddq_u8(q11, q12);
+ q6 = vhaddq_u8 (q7, q13);
+ q7 = vhaddq_u8 (q11, q12);
+ q4 = vhaddq_u8 (q4, q5 );
+ q6 = vrhaddq_u8(q6, q7 );
+ q4 = vrhaddq_u8(q4, q6 );
+ q6 = vrhaddq_u8(q14, q13);
+ q5 = vrhaddq_u8(q11, q12);
+ q5 = vhaddq_u8 (q6, q5 );
+ q6 = vhaddq_u8 (q14, q13);
+ q7 = vhaddq_u8 (q11, q12);
+ q6 = vrhaddq_u8(q6, q7 );
+ q5 = vrhaddq_u8(q5, q6 );
+ q6 = vhaddq_u8 (q12, q10);
+ q6 = vrhaddq_u8(q6, q13);
+
+ q4 = vbslq_u8(q3, q4, q6); // VBIF q4, q6, q3 ;q0
+ q3 = vbslq_u8(q3, q5, q13);// VBSL q3, q5, q13 ;q1
+
+ q10 = vbslq_u8(q2,q0, q10);
+ q11 = vbslq_u8(q2,q1, q11);
+ q12 = vbslq_u8(q2,q4, q12);
+ q13 = vbslq_u8(q2,q3, q13);
+
+ q9 = vspill0;
+ q14 = vspill1;
+
+ TR8(q8, q9 );
+ TR8(q10, q11);
+ TR8(q12, q13);
+ TR8(q14, q15);
+ TR16(q8, q10);
+ TR16(q9, q11);
+ TR16(q12, q14);
+ TR16(q13, q15);
+ TR32(q8, q12);
+ TR32(q9, q13);
+ TR32(q10, q14);
+ TR32(q11, q15);
+
+ pix -= 8*stride + 4;
+ vst1_u8(pix, vget_low_u8(q8)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q9)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q10)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q11)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q12)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q13)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q14)); pix += stride;
+ vst1_u8(pix, vget_low_u8(q15)); pix += stride;
+
+ vst1_u8(pix, vget_high_u8(q8)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q9)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q10)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q11)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q12)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q13)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q14)); pix += stride;
+ vst1_u8(pix, vget_high_u8(q15)); pix += stride;
+}
+
+static void deblock_luma_h_neon(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+ uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q9, q10, q11, q12, q13, q14;
+ uint8x16_t tmp;
+
+ q9 = vld1q_u8(pix - 3*stride);
+ q10 = vld1q_u8(pix - 2*stride);
+ q11 = vld1q_u8(pix - 1*stride);
+ q12 = vld1q_u8(pix);
+ q13 = vld1q_u8(pix + 1*stride);
+ q14 = vld1q_u8(pix + 2*stride);
+
+ q1 = vabdq_u8(q11, q12);
+ q2 = vcltq_u8(q1, vdupq_n_u8(alpha));
+ q1 = vcltq_u8(vmaxq_u8(vabdq_u8(q11, q10), vabdq_u8(q12, q13)), vdupq_n_u8(beta));
+ q2 = vandq_u8(q2, q1);
+
+ tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pstr));
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ q1 = tmp;
+
+ q1 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+ q2 = vandq_u8(q2, q1);
+ q7 = vhsubq_u8(q10, q13);
+ q7 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(q7), 1));
+ q0 = veorq_u8(q12, q11);
+ q6 = vandq_u8(vdupq_n_u8(1), q0);
+
+ q0 = vhsubq_u8(q12, q11);// (q0-p0)>>1
+
+ q7 = vreinterpretq_u8_s8(vrhaddq_s8(vreinterpretq_s8_u8(q7), vreinterpretq_s8_u8(q6))); //((p1-q1)>>2 + carry + 1) >> 1
+ q7 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(q0), vreinterpretq_s8_u8(q7))); //=delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+ q7 = vandq_u8(q7, q2);
+
+ tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pthr));
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ tmp = vzipq_u8(tmp, tmp).val[0];
+ q1 = tmp;
+
+ q1 = vandq_u8(q2, q1);
+
+ q0 = vabdq_u8(q9, q11); // ap = ABS(p2 - p0);
+ q0 = vcltq_u8(q0, vdupq_n_u8(beta)); //sp = (ap - beta) >> 31;
+ q4 = vandq_u8(q0, q2); // & sp
+ q0 = vabdq_u8(q14, q12);//aq = ABS(q2 - q0);
+ q0 = vcltq_u8(q0, vdupq_n_u8(beta));//sq = (aq - beta) >> 31;
+ q3 = vandq_u8(q0, q2); // & sq
+
+ q0 = vrhaddq_u8(q11, q12);//((p0+q0+1)>>1)
+ q0 = vhaddq_u8 (q0, q9 );//((p2 + ((p0+q0+1)>>1))>>1)
+ q5 = vandq_u8 (q1, q4 );
+ q6 = vqaddq_u8 (q10, q5 );//{p1+thr}
+ q0 = vminq_u8 (q0, q6 );
+ q6 = vqsubq_u8 (q10, q5 );//{p1-thr}
+ q10 = vmaxq_u8 (q0, q6 );
+
+ q0 = vrhaddq_u8(q11, q12);// ;((p0+q0+1)>>1)
+ q0 = vhaddq_u8 (q0, q14);// ;((q2 + ((p0+q0+1)>>1))>>1)
+ q5 = vandq_u8 (q1, q3 );
+ q6 = vqaddq_u8 (q13, q5 );// ;{q1+thr}
+ q0 = vminq_u8 (q0, q6 );
+ q6 = vqsubq_u8 (q13, q5 );// ;{q1-thr}
+ q13 = vmaxq_u8 (q0, q6 );
+
+ q1 = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q3)));
+ q1 = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q4))); //tC = thr - sp - sq;
+ q1 = vandq_u8(q1, q2);// ; set thr = 0 if str==0
+
+ q6 = veorq_u8(q6, q6);
+ q5 = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //delta > 0
+ q7 = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7)));
+ q6 = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //-(delta < 0)
+ q5 = vminq_u8(q1, q5);
+ q6 = vminq_u8(q1, q6);
+
+ q11 = vqaddq_u8(q11, q5);
+ q11 = vqsubq_u8(q11, q6);
+ q12 = vqsubq_u8(q12, q5);
+ q12 = vqaddq_u8(q12, q6);
+
+ vst1q_u8(pix - 2*stride, q10);
+ vst1q_u8(pix - 1*stride, q11);
+ vst1q_u8(pix , q12);
+ vst1q_u8(pix + 1*stride, q13);
+}
+
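+/*
+ * Chroma deblocking only rewrites the two pixels adjacent to the edge
+ * (p0 and q0).  The clip threshold is tc0 + 1, the delta is the same
+ * (((q0-p0)<<2) + (p1-q1) + 4) >> 3 form used for luma, and strong
+ * (bS == 4) edges replace p0/q0 with p1/q1-weighted averages instead.
+ * The vertical variant below first transposes the 4 pixels around the
+ * edge from 8 rows so it can share the horizontal lane layout.
+ */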
+static void deblock_chroma_v_neon(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+ int32x2_t d16 = vld1_s32((int32_t*)(pix - 2 + 0*stride));
+ int32x2_t d18 = vld1_s32((int32_t*)(pix - 2 + 1*stride));
+ int32x2_t d20 = vld1_s32((int32_t*)(pix - 2 + 2*stride));
+ int32x2_t d22 = vld1_s32((int32_t*)(pix - 2 + 3*stride));
+ int32x2_t d17 = vld1_s32((int32_t*)(pix - 2 + 4*stride));
+ int32x2_t d19 = vld1_s32((int32_t*)(pix - 2 + 5*stride));
+ int32x2_t d21 = vld1_s32((int32_t*)(pix - 2 + 6*stride));
+ int32x2_t d23 = vld1_s32((int32_t*)(pix - 2 + 7*stride));
+ int32x2x2_t tr0 = vtrn_s32(d16, d17);
+ int32x2x2_t tr1 = vtrn_s32(d18, d19);
+ int32x2x2_t tr2 = vtrn_s32(d20, d21);
+ int32x2x2_t tr3 = vtrn_s32(d22, d23);
+ int16x8x2_t tr4 = vtrnq_s16(vreinterpretq_s16_s32(vcombine_s32(tr0.val[0], tr0.val[1])), vreinterpretq_s16_s32(vcombine_s32(tr2.val[0], tr2.val[1])));
+ int16x8x2_t tr5 = vtrnq_s16(vreinterpretq_s16_s32(vcombine_s32(tr1.val[0], tr1.val[1])), vreinterpretq_s16_s32(vcombine_s32(tr3.val[0], tr3.val[1])));
+ uint8x16x2_t tr6 = vtrnq_u8(vreinterpretq_u8_s16(tr4.val[0]), vreinterpretq_u8_s16(tr5.val[0]));
+ uint8x16x2_t tr7 = vtrnq_u8(vreinterpretq_u8_s16(tr4.val[1]), vreinterpretq_u8_s16(tr5.val[1]));
+
+{
+ uint8x16_t q8 = tr6.val[0];
+ uint8x16_t q9 = tr6.val[1];
+ uint8x16_t q10 = tr7.val[0];
+ uint8x16_t q11 = tr7.val[1];
+
+ uint8x16_t q1 = vabdq_u8(q9, q10);
+ uint8x16_t q2 = vcltq_u8(q1, vdupq_n_u8(a));
+ uint8x16_t q4 = vmaxq_u8(vabdq_u8(q10, q11), vabdq_u8(q8, q9));
+ uint8x16_t q0;
+ uint8x16_t q3;
+ uint8x16_t q6;
+ int8x16_t q4s;
+ int8x16_t q7;
+ uint8x16_t q7u;
+ uint8x16_t q5;
+ uint8x16_t vstr = vld1q_u8(str);
+ uint8x16_t vthr = vld1q_u8(thr);
+
+ q4 = vcltq_u8(q4, vdupq_n_u8(b));
+ q2 = vandq_u8(q2, q4);
+ q1 = vzipq_u8(vstr, vstr).val[0];
+ q3 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+ q1 = vshrq_n_u8(q1, 2);
+ q1 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+ q2 = vandq_u8(q2, q3);
+
+ q0 = vzipq_u8(vthr, vthr).val[0];
+ q0 = vaddq_u8(q0, vdupq_n_u8(1));
+ q0 = vandq_u8(q0, q2);
+
+ q7 = vshrq_n_s8(vreinterpretq_s8_u8(vhsubq_u8(q8, q11)), 1);
+ q6 = vandq_u8(vdupq_n_u8(1), veorq_u8(q10, q9));
+ q4 = vhsubq_u8(q10, q9);
+ q7 = vrhaddq_s8(q7, vreinterpretq_s8_u8(q6));
+ q7 = vqaddq_s8(vreinterpretq_s8_u8(q4), q7);
+
+ q4s = vdupq_n_s8(0);
+ q5 = vreinterpretq_u8_s8(vmaxq_s8(q4s, q7));
+ q4 = vreinterpretq_u8_s8(vmaxq_s8(q4s, vsubq_s8(q4s, q7)));
+ q5 = vminq_u8(q0, q5);
+ q4 = vminq_u8(q0, q4);
+
+ q0 = vqaddq_u8(q9, q5);
+ q0 = vqsubq_u8(q0, q4);
+ q3 = vqsubq_u8(q10, q5);
+ q3 = vqaddq_u8(q3, q4);
+
+ q6 = vrhaddq_u8(vhaddq_u8(q9, q11), q8);
+ q7u = vrhaddq_u8(vhaddq_u8(q8, q10), q11);
+
+ q0 = vbslq_u8(q1, q6, q0 );
+ q3 = vbslq_u8(q1, q7u, q3 );
+ q9 = vbslq_u8(q2, q0, q9 );
+ q10= vbslq_u8(q2, q3, q10);
+
+ tr6 = vtrnq_u8(q8, q9);
+ tr7 = vtrnq_u8(q10, q11);
+
+ tr4 = vtrnq_s16(vreinterpretq_s16_u8(tr6.val[0]), vreinterpretq_s16_u8(tr7.val[0]));
+ tr5 = vtrnq_s16(vreinterpretq_s16_u8(tr6.val[1]), vreinterpretq_s16_u8(tr7.val[1]));
+
+ tr0 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr4.val[0])), vget_high_s32(vreinterpretq_s32_s16(tr4.val[0])));
+ tr1 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr5.val[0])), vget_high_s32(vreinterpretq_s32_s16(tr5.val[0])));
+ tr2 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr4.val[1])), vget_high_s32(vreinterpretq_s32_s16(tr4.val[1])));
+ tr3 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr5.val[1])), vget_high_s32(vreinterpretq_s32_s16(tr5.val[1])));
+
+#if 0
+ // unaligned store fools Android NDK 15 optimizer
+ *(int32_t*)(uint8_t*)(pix - 2 + 0*stride) = vget_lane_s32(tr0.val[0], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 1*stride) = vget_lane_s32(tr1.val[0], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 2*stride) = vget_lane_s32(tr2.val[0], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 3*stride) = vget_lane_s32(tr3.val[0], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 4*stride) = vget_lane_s32(tr0.val[1], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 5*stride) = vget_lane_s32(tr1.val[1], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 6*stride) = vget_lane_s32(tr2.val[1], 0);
+ *(int32_t*)(uint8_t*)(pix - 2 + 7*stride) = vget_lane_s32(tr3.val[1], 0);
+#else
+ vst1_lane_s16((int16_t*)(pix - 2 + 0*stride), vreinterpret_s16_s32(tr0.val[0]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 0*stride) + 1, vreinterpret_s16_s32(tr0.val[0]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 1*stride), vreinterpret_s16_s32(tr1.val[0]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 1*stride) + 1, vreinterpret_s16_s32(tr1.val[0]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 2*stride), vreinterpret_s16_s32(tr2.val[0]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 2*stride) + 1, vreinterpret_s16_s32(tr2.val[0]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 3*stride), vreinterpret_s16_s32(tr3.val[0]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 3*stride) + 1, vreinterpret_s16_s32(tr3.val[0]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 4*stride), vreinterpret_s16_s32(tr0.val[1]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 4*stride) + 1, vreinterpret_s16_s32(tr0.val[1]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 5*stride), vreinterpret_s16_s32(tr1.val[1]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 5*stride) + 1, vreinterpret_s16_s32(tr1.val[1]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 6*stride), vreinterpret_s16_s32(tr2.val[1]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 6*stride) + 1, vreinterpret_s16_s32(tr2.val[1]), 1);
+ vst1_lane_s16((int16_t*)(pix - 2 + 7*stride), vreinterpret_s16_s32(tr3.val[1]), 0);
+ vst1_lane_s16((int16_t*)(pix - 2 + 7*stride) + 1, vreinterpret_s16_s32(tr3.val[1]), 1);
+#endif
+}
+}
+
+static void deblock_chroma_h_neon(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+ uint8x16_t q0;
+ uint8x16_t q8 = vld1q_u8(pix - 2*stride);
+ uint8x16_t q9 = vld1q_u8(pix - 1*stride);
+ uint8x16_t q10 = vld1q_u8(pix);
+ uint8x16_t q11 = vld1q_u8(pix + stride);
+ uint8x16_t q1 = vabdq_u8(q9, q10);
+ uint8x16_t q2 = vcltq_u8(q1, vdupq_n_u8(a));
+ uint8x16_t q4 = vmaxq_u8(vabdq_u8(q10, q11), vabdq_u8(q8, q9));
+ uint8x16_t q3;
+ uint8x16_t q6;
+ int8x16_t q4s;
+ int8x16_t q7;
+ uint8x16_t q7u;
+ uint8x16_t q5;
+ uint8x16_t vstr = vld1q_u8(str);
+ uint8x16_t vthr = vld1q_u8(thr);
+
+ q4 = vcltq_u8(q4, vdupq_n_u8(b));
+ q2 = vandq_u8(q2, q4);
+ q1 = vzipq_u8(vstr, vstr).val[0];
+ q3 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+ q1 = vshrq_n_u8(q1, 2);
+ q1 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+ q2 = vandq_u8(q2, q3);
+
+ q0 = vzipq_u8(vthr, vthr).val[0];
+ q0 = vaddq_u8(q0, vdupq_n_u8(1));
+ q0 = vandq_u8(q0, q2);
+
+ q7 = vshrq_n_s8(vreinterpretq_s8_u8(vhsubq_u8(q8, q11)), 1);
+ q6 = vandq_u8(vdupq_n_u8(1), veorq_u8(q10, q9));
+ q4 = vhsubq_u8(q10, q9);
+ q7 = vrhaddq_s8(q7, vreinterpretq_s8_u8(q6));
+ q7 = vqaddq_s8(vreinterpretq_s8_u8(q4), q7);
+
+ q4s = vdupq_n_s8(0);
+ q5 = vreinterpretq_u8_s8(vmaxq_s8(q4s, q7));
+ q4 = vreinterpretq_u8_s8(vmaxq_s8(q4s, vsubq_s8(q4s, q7)));
+ q5 = vminq_u8(q0, q5);
+ q4 = vminq_u8(q0, q4);
+
+ q0 = vqaddq_u8(q9, q5);
+ q0 = vqsubq_u8(q0, q4);
+ q3 = vqsubq_u8(q10, q5);
+ q3 = vqaddq_u8(q3, q4);
+
+ q6 = vrhaddq_u8(vhaddq_u8(q9, q11), q8);
+ q7u = vrhaddq_u8(vhaddq_u8(q8, q10), q11);
+
+ q0 = vbslq_u8(q1, q6, q0 );
+ q3 = vbslq_u8(q1, q7u, q3 );
+ q9 = vbslq_u8(q2, q0, q9 );
+ q10= vbslq_u8(q2, q3, q10);
+
+ vst1_u8(pix - stride, vget_low_u8(q9));
+ vst1_u8(pix, vget_low_u8(q10));
+}
+
+static void h264e_deblock_chroma_neon(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+ const uint8_t *alpha = par->alpha;
+ const uint8_t *beta = par->beta;
+ const uint8_t *thr = par->tc0;
+ const uint8_t *strength = (uint8_t *)par->strength32;
+ int a, b, x, y;
+ a = alpha[0];
+ b = beta[0];
+ for (x = 0; x < 16; x += 8)
+ {
+ uint32_t str = *(uint32_t*)&strength[x];
+ if (str && a)
+ {
+ deblock_chroma_v_neon(pix + (x >> 1), stride, a, b, thr + x, strength + x);
+ }
+ a = alpha[1];
+ b = beta[1];
+ }
+ thr += 16;
+ strength += 16;
+ a = alpha[2];
+ b = beta[2];
+ for (y = 0; y < 16; y += 8)
+ {
+ uint32_t str = *(uint32_t*)&strength[y];
+ if (str && a)
+ {
+ deblock_chroma_h_neon(pix, stride, a, b, thr + y, strength + y);
+ }
+ pix += 4*stride;
+ a = alpha[3];
+ b = beta[3];
+ }
+}
+
+static void h264e_deblock_luma_neon(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+ const uint8_t *alpha = par->alpha;
+ const uint8_t *beta = par->beta;
+ const uint8_t *thr = par->tc0;
+ const uint8_t *strength = (uint8_t *)par->strength32;
+ int a = alpha[0];
+ int b = beta[0];
+ int x, y;
+ for (x = 0; x < 16; x += 4)
+ {
+ uint32_t str = *(uint32_t*)&strength[x];
+ if ((uint8_t)str == 4)
+ {
+ deblock_luma_v_s4_neon(pix + x, stride, a, b);
+ } else if (str && a)
+ {
+ deblock_luma_v_neon(pix + x, stride, a, b, thr + x, strength + x);
+ }
+ a = alpha[1];
+ b = beta[1];
+ }
+ a = alpha[2];
+ b = beta[2];
+ thr += 16;
+ strength += 16;
+ for (y = 0; y < 16; y += 4)
+ {
+ uint32_t str = *(uint32_t*)&strength[y];
+ if ((uint8_t)str == 4)
+ {
+ deblock_luma_h_s4_neon(pix, stride, a, b);
+ } else if (str && a)
+ {
+ deblock_luma_h_neon(pix, stride, a, b, thr + y, strength + y);
+ }
+ a = alpha[3];
+ b = beta[3];
+ pix += 4*stride;
+ }
+}
+
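+/*
+ * Temporal denoiser.  For every interior pixel the previous frame is
+ * blended with the current one using two gains, one driven by the pixel
+ * difference and one by the 4-neighbour difference, and the result is
+ * written back into the previous-frame buffer; the scalar tail loop below
+ * shows the exact formula:
+ *
+ *     g = gn * gd;                                    // Q8 * Q8 = Q16
+ *     out = (prev*g + (0xffff - g)*cur + (1 << 15)) >> 16;
+ *
+ * The vector body derives its gains from a fixed-point vcls/vqdmulh
+ * squaring chain instead of the g_diff_to_gainQ8[] table, and the frame
+ * border is handled by plain copies.
+ */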
+static void h264e_denoise_run_neon(unsigned char *frm, unsigned char *frmprev, int w, int h_arg, int stride_frm, int stride_frmprev)
+{
+ int cloop, h = h_arg;
+ if (w <= 2 || h <= 2)
+ {
+ return;
+ }
+ w -= 2;
+ h -= 2;
+
+ do
+ {
+ unsigned char *pf = frm += stride_frm;
+ unsigned char *pp = frmprev += stride_frmprev;
+ cloop = w;
+ pp[-stride_frmprev] = *pf++;
+ pp++;
+
+ for (;cloop >= 8; cloop -= 8, pf += 8, pp += 8)
+ {
+ uint16x8_t vp0w;
+ uint32x4_t vpr0;
+ uint32x4_t vpr1;
+ uint16x8_t vf0w;
+ int16x8_t vcls, vt, vcl, vgn, vgd;
+ uint16x8_t vg;
+ uint8x8_t vf0 = vld1_u8(pf);
+ uint8x8_t vft = vld1_u8(pf - stride_frm);
+ uint8x8_t vfb = vld1_u8(pf + stride_frm);
+ uint8x8_t vfl = vld1_u8(pf - 1);
+ uint8x8_t vfr = vld1_u8(pf + 1);
+ uint8x8_t vp0 = vld1_u8(pp);
+ uint8x8_t vpt = vld1_u8(pp - stride_frmprev);
+ uint8x8_t vpb = vld1_u8(pp + stride_frmprev);
+ uint8x8_t vpl = vld1_u8(pp - 1);
+ uint8x8_t vpr = vld1_u8(pp + 1);
+ uint16x8_t vd = vabdl_u8(vf0, vp0);
+ uint16x8_t vfs = vaddw_u8(vaddw_u8(vaddl_u8(vft, vfb), vfl), vfr);
+ uint16x8_t vps = vaddw_u8(vaddw_u8(vaddl_u8(vpt, vpb), vpl), vpr);
+ uint16x8_t vneighbourhood = vshrq_n_u16(vabdq_u16(vfs, vps), 2);
+
+ vt = vaddq_s16(vreinterpretq_s16_u16(vd), vdupq_n_s16(1));
+
+ vt = vqshlq_n_s16(vt, 7);
+ vcls = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcls);
+ vt = vqdmulhq_s16(vt,vt); // 1
+
+ vcl = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+ vt = vqdmulhq_s16(vt,vt); // 2
+ vcl = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+ vt = vqdmulhq_s16(vt,vt); // 3
+ vcl = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+ vt = vqdmulhq_s16(vt,vt); // 4
+ vcl = vclsq_s16(vt);
+ // vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+
+ vgd = vsubq_s16(vdupq_n_s16(127), vcls);
+
+ // same as above
+ vt = vaddq_s16(vreinterpretq_s16_u16(vneighbourhood), vdupq_n_s16(1));
+
+ vt = vqshlq_n_s16(vt, 7);
+ vcls = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcls);
+ vt = vqdmulhq_s16(vt,vt); // 1
+ vcl = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+ vt = vqdmulhq_s16(vt,vt); // 2
+ vcl = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+ vt = vqdmulhq_s16(vt,vt); // 3
+ vcl = vclsq_s16(vt);
+ vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+ vt = vqdmulhq_s16(vt,vt); // 4
+ vcl = vclsq_s16(vt);
+ // vt = vshlq_s16(vt, vcl);
+ vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+
+ vgn = vsubq_s16(vdupq_n_s16(127), vcls);
+
+ vgn = vreinterpretq_s16_u16(vshrq_n_u16(vqshlq_n_u16(vreinterpretq_u16_s16(vgn), 10), 8)); // <<=2, saturated
+
+ vgd = vsubq_s16(vdupq_n_s16(255), vgd);
+ vgn = vsubq_s16(vdupq_n_s16(255), vgn);
+
+ //vst1_u8(pp - stride_frmprev, vreinterpret_u8_s8(vmovn_s16(vgn)));
+ //vst1_u8(pp - stride_frmprev, vreinterpret_u8_s8(vmovn_s16(vreinterpretq_s16_u16(vneighbourhood))));
+ //vst1_u8(pp - stride_frmprev, vp0);
+
+ vg = vmulq_u16(vreinterpretq_u16_s16(vgn), vreinterpretq_u16_s16(vgd));
+
+ vp0w = vmovl_u8(vp0);
+ vpr0 = vmull_u16(vget_low_u16(vp0w), vget_low_u16(vg));
+ vpr1 = vmull_u16(vget_high_u16(vp0w), vget_high_u16(vg));
+ vg = vreinterpretq_u16_s16(vsubq_s16(vreinterpretq_s16_u8(vdupq_n_u8(255)), vreinterpretq_s16_u16(vg)));
+
+ vf0w = vmovl_u8(vf0);
+ vpr0 = vmlal_u16(vpr0, vget_low_u16(vf0w), vget_low_u16(vg));
+ vpr1 = vmlal_u16(vpr1, vget_high_u16(vf0w), vget_high_u16(vg));
+
+ vst1_u8(pp - stride_frmprev, vmovn_u16(vcombine_u16(vrshrn_n_u32(vpr0, 16), vrshrn_n_u32(vpr1, 16))));
+ }
+
+ while (cloop--)
+ {
+ int d, neighbourhood;
+ unsigned g, gd, gn, out_val;
+ d = pf[0] - pp[0];
+ neighbourhood = pf[-1] - pp[-1];
+ neighbourhood += pf[+1] - pp[+1];
+ neighbourhood += pf[-stride_frm] - pp[-stride_frmprev];
+ neighbourhood += pf[+stride_frm] - pp[+stride_frmprev];
+
+ if (d < 0)
+ {
+ d = -d;
+ }
+ if (neighbourhood < 0)
+ {
+ neighbourhood = -neighbourhood;
+ }
+ neighbourhood >>= 2;
+
+ gd = g_diff_to_gainQ8[d];
+ gn = g_diff_to_gainQ8[neighbourhood];
+
+ gn <<= 2;
+ if (gn > 255)
+ {
+ gn = 255;
+ }
+
+ gn = 255 - gn;
+ gd = 255 - gd;
+ g = gn*gd; // Q8*Q8 = Q16;
+
+ //out_val = ((pp[0]*g ) >> 16) + (((0xffff-g)*pf[0] ) >> 16);
+ //out_val = ((pp[0]*g + (1<<15)) >> 16) + (((0xffff-g)*pf[0] + (1<<15)) >> 16);
+ out_val = (pp[0]*g + (0xffff - g)*pf[0] + (1 << 15)) >> 16;
+
+ assert(out_val <= 255);
+
+ pp[-stride_frmprev] = (unsigned char)out_val;
+ //pp[-stride_frmprev] = gn;
+ //pp[-stride_frmprev] = neighbourhood;
+ //pp[-stride_frmprev] = pp[0];
+
+ pf++, pp++;
+ }
+
+ pp[-stride_frmprev] = *pf++;
+ } while(--h);
+
+ memcpy(frmprev + stride_frmprev, frm + stride_frm, w + 2);
+ h = h_arg - 2;
+ do
+ {
+ memcpy(frmprev, frmprev - stride_frmprev, w + 2);
+ frmprev -= stride_frmprev;
+ } while(--h);
+ memcpy(frmprev, frm - stride_frm*(h_arg - 2), w + 2);
+}
+
+#undef IS_NULL
+#define IS_NULL(p) ((p) < (pix_t *)32)
+
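+/*
+ * DC prediction for a 4x4 block: dc = (sum(left) + sum(top) + 4) >> 3 when
+ * both neighbours are present, (sum + 2) >> 2 with only one, and 128 with
+ * none (pointers below 32 mean "not available", see IS_NULL above).  The
+ * DC value is replicated into all four bytes of the returned word.
+ */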
+static uint32_t intra_predict_dc4_neon(const pix_t *left, const pix_t *top)
+{
+ unsigned dc = 0, side = 4, round = 0;
+ uint32x2_t s = vdup_n_u32(0);
+
+ if (!IS_NULL(left))
+ {
+ s = vpaddl_u16(vpaddl_u8(vld1_u8(left)));
+ round += side >> 1;
+ }
+ if (!IS_NULL(top))
+ {
+ s = vadd_u32(s, vpaddl_u16(vpaddl_u8(vld1_u8(top))));
+ round += side >> 1;
+ }
+ dc = vget_lane_u32(s, 0);
+
+ dc += round;
+ if (round == side) dc >>= 1;
+ dc >>= 2;
+ if (!round) dc = 128;
+ return dc * 0x01010101;
+}
+
+static uint8x16_t intra_predict_dc16_neon(const pix_t *left, const pix_t *top)
+{
+ unsigned dc = 0, side = 16, round = 0;
+
+ if (!IS_NULL(left))
+ {
+ uint8x16_t v = vld1q_u8(left);
+ uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v)));
+ uint64x1_t q = vadd_u64(vget_high_u64(s), vget_low_u64(s));
+ dc += vget_lane_u32(vreinterpret_u32_u64(q), 0);
+ round += side >> 1;
+ }
+ if (!IS_NULL(top))
+ {
+ uint8x16_t v = vld1q_u8(top);
+ uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v)));
+ uint64x1_t q = vadd_u64(vget_high_u64(s), vget_low_u64(s));
+ dc += vget_lane_u32(vreinterpret_u32_u64(q), 0);
+ round += side >> 1;
+ }
+ dc += round;
+ if (round == side) dc >>= 1;
+ dc >>= 4;
+ if (!round) dc = 128;
+ return vdupq_n_u8(dc);
+}
+
+/*
+ * Note: To make the code more readable we refer to the neighboring pixels
+ * in variables named as below:
+ *
+ * UL U0 U1 U2 U3 U4 U5 U6 U7
+ * L0 xx xx xx xx
+ * L1 xx xx xx xx
+ * L2 xx xx xx xx
+ * L3 xx xx xx xx
+ */
+#define UL edge[-1]
+#define U0 edge[0]
+#define T1 edge[1]
+#define U2 edge[2]
+#define U3 edge[3]
+#define U4 edge[4]
+#define U5 edge[5]
+#define U6 edge[6]
+#define U7 edge[7]
+#define L0 edge[-2]
+#define L1 edge[-3]
+#define L2 edge[-4]
+#define L3 edge[-5]
+
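+/*
+ * 16x16 intra prediction for the three modes used here: mode 0 copies the
+ * row above into every row (vertical), mode 1 replicates each left-column
+ * pixel across its row (horizontal), and mode 2 fills the macroblock with
+ * the DC value from intra_predict_dc16_neon.
+ */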
+static void h264e_intra_predict_16x16_neon(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+ int cloop = 4;
+ uint32_t *d = (uint32_t*)predict;
+ uint32x4_t v;
+ assert(IS_ALIGNED(predict, 4));
+ assert(IS_ALIGNED(top, 4));
+ if (mode != 1)
+ {
+ if (mode < 1)
+ {
+ v = vld1q_u32((uint32_t*)top);
+ } else //(mode == 2)
+ {
+ v = vreinterpretq_u32_u8(intra_predict_dc16_neon(left, top));
+ }
+ do
+ {
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ } while (--cloop);
+ } else //if (mode == 1)
+ {
+ do
+ {
+ vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+ vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+ vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+ vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+ } while (--cloop);
+ }
+}
+
+static void h264e_intra_predict_chroma_neon(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+ int cloop = 8;
+ uint32_t *d = (uint32_t*)predict;
+ uint32x4_t v;
+ assert(IS_ALIGNED(predict, 4));
+ assert(IS_ALIGNED(top, 4));
+ if (mode < 1)
+ {
+ v = vld1q_u32((uint32_t*)top);
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ } else if (mode == 1)
+ {
+ do
+ {
+ v = vreinterpretq_u32_u8(vcombine_u8(vdup_n_u8(left[0]), vdup_n_u8(left[8])));
+ vst1q_u32(d, v); d += 4;
+ left++;
+ } while(--cloop);
+ } else //if (mode == 2)
+ {
+ int ccloop = 2;
+ cloop = 2;
+ do
+ {
+ d[0] = d[1] = d[16] = intra_predict_dc4_neon(left, top);
+ d[17] = intra_predict_dc4_neon(left + 4, top + 4);
+ if (!IS_NULL(top))
+ {
+ d[1] = intra_predict_dc4_neon(NULL, top + 4);
+ }
+ if (!IS_NULL(left))
+ {
+ d[16] = intra_predict_dc4_neon(NULL, left + 4);
+ }
+ d += 2;
+ left += 8;
+ top += 8;
+ } while(--cloop);
+
+ do
+ {
+ v = vld1q_u32(d - 4);
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ vst1q_u32(d, v); d += 4;
+ d += 4;
+ } while(--ccloop);
+ }
+}
+
+static __inline int vsad_neon(uint8x16_t a, uint8x16_t b)
+{
+ uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vabdq_u8(a, b))));
+ uint64x1_t q = vadd_u64(vget_high_u64(s), vget_low_u64(s));
+ return vget_lane_u32(vreinterpret_u32_u64(q), 0);
+}
+
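+/*
+ * 4x4 intra mode decision: the source block is compared by SAD against
+ * each candidate prediction the neighbours allow (DC always, the
+ * directional modes only when the required edge pixels are available);
+ * every mode other than the predicted mode mpred pays penalty.  The
+ * winning prediction is written to blockpred and the result is packed as
+ * best_m + (best_sad << 4).
+ */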
+static int h264e_intra_choose_4x4_neon(const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty)
+{
+ int sad, best_sad, best_m = 2;
+
+ uint32_t r0, r1, r2, r3;
+ uint8x16_t vx, vt, vr, vpred, v1, v2, v8, v9, q1, q2, q10, q11, q12;
+ uint8x8_t d2, d3;
+
+ r0 = ((uint32_t *)blockin)[ 0];
+ r1 = ((uint32_t *)blockin)[ 4];
+ r2 = ((uint32_t *)blockin)[ 8];
+ r3 = ((uint32_t *)blockin)[12];
+ vr = vcombine_u8(vcreate_u8(((uint64_t)r1 << 32) | r0), vcreate_u8(((uint64_t)r3 << 32) | r2));
+
+#define VTEST(mode) sad = vsad_neon(vr,vx); \
+ if (mode != mpred) sad += penalty; \
+ if (sad < best_sad) \
+ { \
+ vpred = vx; \
+ best_sad = sad; \
+ best_m = mode; \
+ }
+
+ // DC
+ vx = vdupq_n_u8(intra_predict_dc4_neon((avail & AVAIL_L) ? &L3 : 0, (avail & AVAIL_T) ? &U0 : 0));
+
+ best_sad = vsad_neon(vx, vr);
+ if (2 != mpred)
+ {
+ best_sad += penalty;
+ }
+ vpred = vx;
+
+ vt = vld1q_u8(&L3);
+ vt = vreinterpretq_u8_u32(vsetq_lane_u32(U7*0x01010101, vreinterpretq_u32_u8(vt), 3));
+ if (avail & AVAIL_T)
+ {
+ uint32x2_t t2;
+ if (!(avail & AVAIL_TR))
+ {
+ vt = vcombine_u8(vget_low_u8(vt), vdup_n_u8(U3));
+ }
+
+ vx = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)&U0));
+ VTEST(0);
+
+ vx = vt;
+ vx = vrhaddq_u8(vhaddq_u8(vextq_u8(vx, vx, 5), vextq_u8(vx, vx, 7)), vextq_u8(vx, vx, 6));
+
+ v1 = vextq_u8(vx, vx, 1);
+ d2 = vext_u8(vget_low_u8(vx), vget_low_u8(vx), 2);
+ d3 = vext_u8(vget_low_u8(vx), vget_low_u8(vx), 3);
+ vx = vreinterpretq_u8_u32(vcombine_u32(
+ t2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(vx)), vreinterpret_u32_u8(vget_low_u8(v1))).val[0],
+ vzip_u32(vreinterpret_u32_u8(d2), vreinterpret_u32_u8(d3)).val[0]));
+ VTEST(3);
+
+ vx = vt;
+ vx = vrhaddq_u8(vextq_u8(vt, vt, 5), vextq_u8(vt, vt, 6));
+ vx = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(vx), vreinterpretq_u32_u8(vextq_u8(vx, vx, 1))).val[0]);
+ vx = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(vx),
+ vreinterpretq_u32_u8(vcombine_u8(vreinterpret_u8_u32(t2), vget_high_u8(vextq_u8(vt, vt, 7))))).val[0]);
+
+ VTEST(7);
+ }
+
+ if (avail & AVAIL_L)
+ {
+ vx = vrev32q_u8(vt);
+ vx = vzipq_u8(vx, vx).val[0];
+ vx = vzipq_u8(vx, vx).val[0];
+ VTEST(1);
+
+ v2 = vrev32q_u8(vt);
+ v8 = vrev32q_u8(vt);
+ vx = vrev32q_u8(vt);
+ v8 = vzipq_u8(vx, vx).val[0];
+ {
+ int tmp = vgetq_lane_u16(vreinterpretq_u16_u8(v8), 3);
+ v2 = vreinterpretq_u8_u16(vsetq_lane_u16(tmp, vreinterpretq_u16_u8(v2), 2));
+ v8 = vreinterpretq_u8_u16(vsetq_lane_u16(tmp, vreinterpretq_u16_u8(v8), 4));
+ v9 = vextq_u8(v2, v2, 14);
+ v9 = vzipq_u8(v9, vhaddq_u8(v9, v2)).val[0];
+ v9 = vrhaddq_u8(v9, vextq_u8(v8, v8, 14));
+ tmp |= tmp << 16;
+ vx = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(vextq_u8(v9, v9, 4)),
+ vreinterpretq_u32_u8(vextq_u8(v9, v9, 6))).val[0]);
+ vx = vreinterpretq_u8_u32(vsetq_lane_u32(tmp, vreinterpretq_u32_u8(vx), 3));
+ }
+ VTEST(8);
+ }
+
+ if ((avail & (AVAIL_T | AVAIL_L | AVAIL_TL)) == (AVAIL_T | AVAIL_L | AVAIL_TL))
+ {
+ uint32x2x2_t pair;
+ uint8x8_t d4, d6;
+ int lr;
+ q11 = q2 = vrhaddq_u8(vhaddq_u8(vt, vextq_u8(vt, vt, 2)), q10 = vextq_u8(vt, vt, 1));
+ d4 = vget_low_u8(q2);
+ d6 = vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(vext_u8(d4, d4, 3)), vreinterpret_u32_u8(vext_u8(d4, d4, 1))).val[0]);
+ d4 = vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(vext_u8(d4, d4, 2)), vreinterpret_u32_u8(d4)).val[0]);
+ pair = vzip_u32(vreinterpret_u32_u8(d6), vreinterpret_u32_u8(d4));
+ vx = vcombine_u8(vreinterpret_u8_u32(pair.val[0]), vreinterpret_u8_u32(pair.val[1]));
+ VTEST(4);
+
+ vx = q12 = vrhaddq_u8(vt, q10);
+ q1 = vzipq_u8(vx, q11).val[0];
+ q1 = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(q1), vreinterpretq_u32_u8(vextq_u8(q1, q1, 2))).val[0]);
+ q1 = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(q1)));
+ vx = vcombine_u8(vget_high_u8(q1), vget_low_u8(q1));
+ vx = vreinterpretq_u8_u16(
+ vsetq_lane_u16(vgetq_lane_u16(vreinterpretq_u16_u8(q11), 2), vreinterpretq_u16_u8(vx), 1));
+ VTEST(6);
+
+ q11 = vextq_u8(q11, q11, 1);
+ q1 = vextq_u8(q12, q12, 4);
+ q2 = vextq_u8(q11, q11, 2);
+ q1 = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(q1), vreinterpretq_u32_u8(q2)).val[0]);
+ q12 = vreinterpretq_u8_u16(vsetq_lane_u16(lr = vgetq_lane_u16(vreinterpretq_u16_u8(q11), 0), vreinterpretq_u16_u8(q12), 1));
+ q11 = vreinterpretq_u8_u16(vsetq_lane_u16((lr << 8) & 0xffff, vreinterpretq_u16_u8(q11), 0));
+ vx = vcombine_u8(vget_low_u8(q1), vreinterpret_u8_u32(vzip_u32(
+ vreinterpret_u32_u8(vext_u8(vget_low_u8(q12), vget_low_u8(q12), 3)),
+ vreinterpret_u32_u8(vext_u8(vget_low_u8(q11), vget_low_u8(q11), 1))
+ ).val[0]));
+ VTEST(5);
+ }
+
+ vst1q_lane_u32(((uint32_t *)blockpred) + 0, vreinterpretq_u32_u8(vpred ), 0);
+ vst1q_lane_u32(((uint32_t *)blockpred) + 4, vreinterpretq_u32_u8(vpred ), 1);
+ vst1q_lane_u32(((uint32_t *)blockpred) + 8, vreinterpretq_u32_u8(vpred ), 2);
+ vst1q_lane_u32(((uint32_t *)blockpred) +12, vreinterpretq_u32_u8(vpred ), 3);
+ return best_m + (best_sad << 4); // pack result
+}
+
+static void copy_wh_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ if (w == 4)
+ {
+ do
+ {
+ *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+ *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+ *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+ *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+ } while (h -= 4);
+ } else if (w == 8)
+ {
+ do
+ {
+ vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+ vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+ vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+ vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+ } while (h -= 4);
+ } else
+ {
+ do
+ {
+ uint8x16_t v0, v1, v2, v3;
+ v0 = vld1q_u8(src); src += src_stride;
+ v1 = vld1q_u8(src); src += src_stride;
+ v2 = vld1q_u8(src); src += src_stride;
+ v3 = vld1q_u8(src); src += src_stride;
+
+ vst1q_u8(dst, v0); dst += 16;
+ vst1q_u8(dst, v1); dst += 16;
+ vst1q_u8(dst, v2); dst += 16;
+ vst1q_u8(dst, v3); dst += 16;
+ } while (h -= 4);
+ }
+}
+
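+/*
+ * Luma half-pel interpolation, 6-tap filter (1, -5, 20, 20, -5, 1):
+ *
+ *     h = clip255((x[-2] - 5*x[-1] + 20*x[0] + 20*x[1] - 5*x[2] + x[3] + 16) >> 5)
+ *
+ * built from vmlal/vmlsl against the c20/c5 constants with a rounding
+ * saturating narrow (vqrshrun_n_s16(.., 5)); output rows use the fixed
+ * 16-byte scratch stride.  hpel_lpf_hor16_neon below keeps the unclipped
+ * 16-bit sums for the vertical pass of the diagonal case.
+ */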
+static void hpel_lpf_hor_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ uint8x8_t c5 = vdup_n_u8(5);
+ uint8x8_t c20 = vshl_n_u8(c5, 2);
+ if (w == 16)
+ {
+ do
+ {
+ uint8x16_t s0 = vld1q_u8(src - 2);
+ uint8x16_t s1 = vld1q_u8(src - 2 + 16);
+ uint8x16_t v0 = s0;
+ uint8x16_t v1 = vextq_u8(s0, s1, 1);
+ uint8x16_t v2 = vextq_u8(s0, s1, 2);
+ uint8x16_t v3 = vextq_u8(s0, s1, 3);
+ uint8x16_t v4 = vextq_u8(s0, s1, 4);
+ uint8x16_t v5 = vextq_u8(s0, s1, 5);
+
+ uint16x8_t q, s = vaddl_u8(vget_low_u8(v0), vget_low_u8(v5));
+ s = vmlsl_u8(s, vget_low_u8(v1), c5);
+ s = vmlsl_u8(s, vget_low_u8(v4), c5);
+ s = vmlal_u8(s, vget_low_u8(v2), c20);
+ s = vmlal_u8(s, vget_low_u8(v3), c20);
+
+ q = vaddl_u8(vget_high_u8(v0), vget_high_u8(v5));
+ q = vmlsl_u8(q, vget_high_u8(v1), c5);
+ q = vmlsl_u8(q, vget_high_u8(v4), c5);
+ q = vmlal_u8(q, vget_high_u8(v2), c20);
+ q = vmlal_u8(q, vget_high_u8(v3), c20);
+
+ vst1q_u8(dst, vcombine_u8(
+ vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5),
+ vqrshrun_n_s16(vreinterpretq_s16_u16(q), 5)));
+
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ } else
+ {
+ do
+ {
+ uint8x16_t line = vld1q_u8(src - 2);
+ uint8x8_t s0 = vget_low_u8(line);
+ uint8x8_t s1 = vget_high_u8(line);
+ uint8x8_t v0 = s0;
+ uint8x8_t v1 = vext_u8(s0, s1, 1);
+ uint8x8_t v2 = vext_u8(s0, s1, 2);
+ uint8x8_t v3 = vext_u8(s0, s1, 3);
+ uint8x8_t v4 = vext_u8(s0, s1, 4);
+ uint8x8_t v5 = vext_u8(s0, s1, 5);
+
+ uint16x8_t s = vaddl_u8(v0, v5);
+ s = vmlsl_u8(s, v1, c5);
+ s = vmlsl_u8(s, v4, c5);
+ s = vmlal_u8(s, v2, c20);
+ s = vmlal_u8(s, v3, c20);
+
+ vst1_u8(dst, vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5));
+
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ }
+}
+
+static void hpel_lpf_hor16_neon(const uint8_t *src, int src_stride, int16_t *h264e_restrict dst, int w, int h)
+{
+ uint8x8_t c5 = vdup_n_u8(5);
+ uint8x8_t c20 = vshl_n_u8(c5, 2);
+ if (w == 16)
+ {
+ do
+ {
+ uint8x16_t s0 = vld1q_u8(src - 2);
+ uint8x16_t s1 = vld1q_u8(src - 2 + 16);
+ uint8x16_t v0 = s0;
+ uint8x16_t v1 = vextq_u8(s0, s1, 1);
+ uint8x16_t v2 = vextq_u8(s0, s1, 2);
+ uint8x16_t v3 = vextq_u8(s0, s1, 3);
+ uint8x16_t v4 = vextq_u8(s0, s1, 4);
+ uint8x16_t v5 = vextq_u8(s0, s1, 5);
+
+ uint16x8_t q, s = vaddl_u8(vget_low_u8(v0), vget_low_u8(v5));
+ s = vmlsl_u8(s, vget_low_u8(v1), c5);
+ s = vmlsl_u8(s, vget_low_u8(v4), c5);
+ s = vmlal_u8(s, vget_low_u8(v2), c20);
+ s = vmlal_u8(s, vget_low_u8(v3), c20);
+
+ q = vaddl_u8(vget_high_u8(v0), vget_high_u8(v5));
+ q = vmlsl_u8(q, vget_high_u8(v1), c5);
+ q = vmlsl_u8(q, vget_high_u8(v4), c5);
+ q = vmlal_u8(q, vget_high_u8(v2), c20);
+ q = vmlal_u8(q, vget_high_u8(v3), c20);
+
+ vst1q_s16(dst, vreinterpretq_s16_u16(s));
+ vst1q_s16(dst + 8, vreinterpretq_s16_u16(q));
+
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ } else
+ {
+ do
+ {
+ uint8x16_t line = vld1q_u8(src - 2);
+ uint8x8_t s0 = vget_low_u8(line);
+ uint8x8_t s1 = vget_high_u8(line);
+ uint8x8_t v0 = s0;
+ uint8x8_t v1 = vext_u8(s0, s1, 1);
+ uint8x8_t v2 = vext_u8(s0, s1, 2);
+ uint8x8_t v3 = vext_u8(s0, s1, 3);
+ uint8x8_t v4 = vext_u8(s0, s1, 4);
+ uint8x8_t v5 = vext_u8(s0, s1, 5);
+
+ uint16x8_t s = vaddl_u8(v0, v5);
+ s = vmlsl_u8(s, v1, c5);
+ s = vmlsl_u8(s, v4, c5);
+ s = vmlal_u8(s, v2, c20);
+ s = vmlal_u8(s, v3, c20);
+
+ vst1q_s16(dst, vreinterpretq_s16_u16(s));
+
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ }
+}
+
+static void hpel_lpf_ver_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ uint8x8_t c5 = vdup_n_u8(5);
+ uint8x8_t c20 = vshl_n_u8(c5, 2);
+
+ if (w == 16)
+ {
+ uint8x16_t v0 = vld1q_u8(src - 2*src_stride);
+ uint8x16_t v1 = vld1q_u8(src - 1*src_stride);
+ uint8x16_t v2 = vld1q_u8(src);
+ uint8x16_t v3 = vld1q_u8(src + 1*src_stride);
+ uint8x16_t v4 = vld1q_u8(src + 2*src_stride);
+ do
+ {
+ uint8x16_t v5 = vld1q_u8(src + 3*src_stride);
+ uint16x8_t q, s = vaddl_u8(vget_low_u8(v0), vget_low_u8(v5));
+ s = vmlsl_u8(s, vget_low_u8(v1), c5);
+ s = vmlsl_u8(s, vget_low_u8(v4), c5);
+ s = vmlal_u8(s, vget_low_u8(v2), c20);
+ s = vmlal_u8(s, vget_low_u8(v3), c20);
+
+ q = vaddl_u8(vget_high_u8(v0), vget_high_u8(v5));
+ q = vmlsl_u8(q, vget_high_u8(v1), c5);
+ q = vmlsl_u8(q, vget_high_u8(v4), c5);
+ q = vmlal_u8(q, vget_high_u8(v2), c20);
+ q = vmlal_u8(q, vget_high_u8(v3), c20);
+
+ vst1q_u8(dst, vcombine_u8(
+ vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5),
+ vqrshrun_n_s16(vreinterpretq_s16_u16(q), 5)));
+ dst += 16;
+ src += src_stride;
+ v0 = v1;
+ v1 = v2;
+ v2 = v3;
+ v3 = v4;
+ v4 = v5;
+ } while (--h);
+ } else
+ {
+ uint8x8_t v0 = vld1_u8(src - 2*src_stride);
+ uint8x8_t v1 = vld1_u8(src - 1*src_stride);
+ uint8x8_t v2 = vld1_u8(src);
+ uint8x8_t v3 = vld1_u8(src + 1*src_stride);
+ uint8x8_t v4 = vld1_u8(src + 2*src_stride);
+ do
+ {
+ uint8x8_t v5 = vld1_u8(src + 3*src_stride);
+ uint16x8_t s = vaddl_u8(v0, v5);
+ s = vmlsl_u8(s, v1, c5);
+ s = vmlsl_u8(s, v4, c5);
+ s = vmlal_u8(s, v2, c20);
+ s = vmlal_u8(s, v3, c20);
+
+ vst1_u8(dst, vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5));
+ dst += 16;
+ src += src_stride;
+ v0 = v1;
+ v1 = v2;
+ v2 = v3;
+ v3 = v4;
+ v4 = v5;
+ } while (--h);
+ }
+}
+
+static void hpel_lpf_ver16_neon(const int16_t *src, uint8_t *h264e_restrict dst, int w, int h)
+{
+ do
+ {
+ int cloop = h;
+ int16x8_t v0 = vld1q_s16(src);
+ int16x8_t v1 = vld1q_s16(src + 16);
+ int16x8_t v2 = vld1q_s16(src + 16*2);
+ int16x8_t v3 = vld1q_s16(src + 16*3);
+ int16x8_t v4 = vld1q_s16(src + 16*4);
+ do
+ {
+ int16x8_t v5 = vld1q_s16(src+16*5);
+
+ int16x8_t s0 = vaddq_s16(v0, v5);
+ int16x8_t s1 = vaddq_s16(v1, v4);
+ int16x8_t s2 = vaddq_s16(v2, v3);
+
+ int16x8_t vs = vshrq_n_s16(vsubq_s16(s0, s1), 2);
+ int16x8_t vq = vsubq_s16(s2, s1);
+ s0 = vshrq_n_s16(vaddq_s16(vq, vs), 2);
+ s0 = vaddq_s16(s0, s2);
+
+ vst1_u8(dst, vqrshrun_n_s16(s0, 6));
+
+ dst += 16;
+ src += 16;
+ v0 = v1;
+ v1 = v2;
+ v2 = v3;
+ v3 = v4;
+ v4 = v5;
+ } while (--cloop);
+
+ src -= 16*h - 8;
+ dst -= 16*h - 8;
+ } while (w -= 8);
+}
+
+static void hpel_lpf_diag_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ ALIGN(16) int16_t scratch[21 * 16] ALIGN2(16); /* 21 rows by 16 pixels per row */
+
+ /*
+ * Intermediate values are half-pel samples in the horizontal direction,
+ * starting at (0.5, -2) at the top and extending to (0.5, height + 3) at
+ * the bottom; scratch holds a 2D array of size w x (h + 5).
+ */
+ hpel_lpf_hor16_neon(src - 2*src_stride, src_stride, scratch, w, h + 5);
+ hpel_lpf_ver16_neon(scratch, dst, w, h);
+}
+
+static void average_16x16_unalign_neon(uint8_t *dst, const uint8_t *src, int src_stride)
+{
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src))); src += src_stride; dst += 16;
+}
+
+static void h264e_qpel_average_wh_align_neon(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, point_t wh)
+{
+ int w = wh.s.x;
+ int h = wh.s.y;
+ int cloop = h;
+ if (w == 8)
+ {
+ do
+ {
+ vst1_u8(dst, vrhadd_u8(vld1_u8(src0), vld1_u8(src1)));
+ dst += 16;
+ src0 += 16;
+ src1 += 16;
+ } while (--cloop);
+ } else
+ {
+ do
+ {
+ vst1q_u8(dst, vrhaddq_u8(vld1q_u8(src0), vld1q_u8(src1)));
+ dst += 16;
+ src0 += 16;
+ src1 += 16;
+ } while (--cloop);
+ }
+}
+
+static void h264e_qpel_interpolate_luma_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+// src += ((dx + 1) >> 2) + ((dy + 1) >> 2)*src_stride; // dx == 3 ? next pixel; dy == 3 ? next line
+// dxdy actions: Horizontal, Vertical, Diagonal, Average
+//        dx:  0     1     2     3     (+1 when dx == 3)
+//   dy 0      -     ha    h     ha+
+//      1      va    hva   hda   hv+a
+//      2      v     vda   d     v+da
+//      3      va+   h+va  h+da  h+v+a
+//   (+stride when dy == 3)
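+// e.g. dxdy = (2,0) -> pos = 1<<2:  pure horizontal half-pel;
+//      dxdy = (2,2) -> pos = 1<<10: pure diagonal half-pel;
+//      dxdy = (1,0) -> pos = 1<<1:  horizontal half-pel averaged with the
+//                                   integer-pel pixels (quarter-pel).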
+ int32_t pos = 1 << (dxdy.s.x + 4*dxdy.s.y);
+
+ if (pos == 1)
+ {
+ copy_wh_neon(src, src_stride, dst, wh.s.x, wh.s.y);
+ } else
+ {
+ ALIGN(16) uint8_t scratch[16*16] ALIGN2(16);
+ int dstused = 0;
+ if (pos & 0xe0ee)// 1110 0000 1110 1110
+ {
+ hpel_lpf_hor_neon(src + ((pos & 0xe000) ? src_stride : 0), src_stride, dst, wh.s.x, wh.s.y);
+ dstused++;
+ }
+ if (pos & 0xbbb0)// 1011 1011 1011 0000
+ {
+ hpel_lpf_ver_neon(src + ((pos & 0x8880) ? 1 : 0), src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+ dstused++;
+ }
+ if (pos & 0x4e40)// 0100 1110 0100 0000
+ {
+ hpel_lpf_diag_neon(src, src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+ dstused++;
+ }
+ if (pos & 0xfafa)// 1111 1010 1111 1010
+ {
+ assert(wh.s.x == 16 && wh.s.y == 16);
+ if (dstused == 2)
+ {
+ point_t p;
+
+ src = scratch;
+ src_stride = 16;
+ p.u32 = 16 + (16 << 16);
+
+ h264e_qpel_average_wh_align_neon(src, dst, dst, p);
+ } else
+ {
+ src += ((dxdy.s.x + 1) >> 2) + ((dxdy.s.y + 1) >> 2)*src_stride;
+ average_16x16_unalign_neon(dst, src, src_stride);
+ }
+ }
+ }
+}
+
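+// Chroma is interpolated bilinearly, as in the plain C version below:
+// out = (a*A + b*B + c*C + d*D + 32) >> 6 with a = (8-dx)(8-dy), b = dx(8-dy),
+// c = (8-dx)dy, d = dx*dy. The weights are broadcast once and the loop carries
+// the previous row's samples to avoid reloading them.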
+static void h264e_qpel_interpolate_chroma_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    /* if the fractional mv is not (0, 0) */
+ if (dxdy.u32)
+ {
+ uint8x8_t v8 = vdup_n_u8(8);
+ uint8x8_t vx = vdup_n_u8(dxdy.s.x);
+ uint8x8_t vy = vdup_n_u8(dxdy.s.y);
+ uint8x8_t v8x = vsub_u8(v8, vx);
+ uint8x8_t v8y = vsub_u8(v8, vy);
+ uint8x8_t va = vmul_u8(v8x, v8y);
+ uint8x8_t vb = vmul_u8(vx, v8y);
+ uint8x8_t vc = vmul_u8(v8x, vy);
+ uint8x8_t vd = vmul_u8(vx, vy);
+ int h = wh.s.y;
+ if (wh.s.x == 8)
+ {
+ uint8x16_t vt0 = vld1q_u8(src);
+ uint8x16_t vt1 = vextq_u8(vt0, vt0, 1);
+ src += src_stride;
+ do
+ {
+ uint8x16_t vb0 = vld1q_u8(src);
+ uint8x16_t vb1 = vextq_u8(vb0, vb0, 1);
+ uint16x8_t vs = vmull_u8(vget_low_u8(vt0), va);
+ vs = vmlal_u8(vs, vget_low_u8(vt1), vb);
+ vs = vmlal_u8(vs, vget_low_u8(vb0), vc);
+ vs = vmlal_u8(vs, vget_low_u8(vb1), vd);
+ vst1_u8(dst, vqrshrun_n_s16(vreinterpretq_s16_u16(vs), 6));
+ vt0 = vb0;
+ vt1 = vb1;
+ dst += 16;
+ src += src_stride;
+ } while(--h);
+ } else
+ {
+ uint8x8_t vt0 = vld1_u8(src);
+ uint8x8_t vt1 = vext_u8(vt0, vt0, 1);
+ src += src_stride;
+ do
+ {
+ uint8x8_t vb0 = vld1_u8(src);
+ uint8x8_t vb1 = vext_u8(vb0, vb0, 1);
+ uint16x8_t vs = vmull_u8(vt0, va);
+ vs = vmlal_u8(vs, vt1, vb);
+ vs = vmlal_u8(vs, vb0, vc);
+ vs = vmlal_u8(vs, vb1, vd);
+ *(int32_t*)dst = vget_lane_s32(vreinterpret_s32_u8(vqrshrun_n_s16(vreinterpretq_s16_u16(vs), 6)), 0);
+ vt0 = vb0;
+ vt1 = vb1;
+ dst += 16;
+ src += src_stride;
+ } while(--h);
+ }
+ } else
+ {
+ copy_wh_neon(src, src_stride, dst, wh.s.x, wh.s.y);
+ }
+}
+
+static int h264e_sad_mb_unlaign_8x8_neon(const pix_t *a, int a_stride, const pix_t *b, int _sad[4])
+{
+ uint16x8_t s0, s1;
+ uint8x16_t va, vb;
+ int cloop = 2, sum = 0;
+ do
+ {
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabdl_u8( vget_low_u8(va), vget_low_u8(vb)); s1 = vabdl_u8( vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ {
+ uint32x4_t v0 = vpaddlq_u16(s0);
+ uint64x2_t v1 = vpaddlq_u32(v0);
+ sum += _sad[0] = (int)(vgetq_lane_u64(v1, 0)+vgetq_lane_u64(v1, 1));
+ v0 = vpaddlq_u16(s1);
+ v1 = vpaddlq_u32(v0);
+ sum += _sad[1] = (int)(vgetq_lane_u64(v1, 0)+vgetq_lane_u64(v1, 1));
+ _sad += 2;
+ }
+ } while(--cloop);
+ return sum;
+}
+
+static int h264e_sad_mb_unlaign_wh_neon(const pix_t *a, int a_stride, const pix_t *b, point_t wh)
+{
+ uint16x8_t s0, s1;
+ uint8x16_t va, vb;
+ int cloop = wh.s.y/8, sum = 0;
+ if (wh.s.x == 16)
+ {
+ do
+ {
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabdl_u8( vget_low_u8(va), vget_low_u8(vb)); s1 = vabdl_u8( vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb)); s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb));
+
+ uint32x4_t v0 = vpaddlq_u16(s0);
+ uint64x2_t v1 = vpaddlq_u32(v0);
+ sum += vgetq_lane_u64(v1, 0) + vgetq_lane_u64(v1, 1);
+
+ v0 = vpaddlq_u16(s1);
+ v1 = vpaddlq_u32(v0);
+ sum += vgetq_lane_u64(v1, 0) + vgetq_lane_u64(v1, 1);
+ } while(--cloop);
+ } else
+ {
+ do
+ {
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabdl_u8( vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+ va = vld1q_u8(a), vb = vld1q_u8(b); a += a_stride, b += 16;
+ s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+
+ uint32x4_t v0 = vpaddlq_u16(s0);
+ uint64x2_t v1 = vpaddlq_u32(v0);
+ sum += vgetq_lane_u64(v1, 0) + vgetq_lane_u64(v1, 1);
+ } while(--cloop);
+ }
+ return sum;
+}
+
+static void h264e_copy_8x8_neon(pix_t *d, int d_stride, const pix_t *s)
+{
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+ vst1_u8(d, vld1_u8(s)); s += 16; d += d_stride;
+}
+
+
+static void h264e_copy_16x16_neon(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+ assert(!((unsigned)d & 7));
+ assert(!((unsigned)s & 7));
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+ vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+}
+
+// Keep intermediate data in transposed format:
+// this saves a transpose in the vectorized implementation.
+// TODO: TRANSPOSE_BLOCK==0 is broken
+#define TRANSPOSE_BLOCK 0
+#define UNZIGSAG_IN_QUANT 0
+
+#define SUM_DIF(a, b) { int t = a + b; b = a - b; a = t; }
+
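+// In-place 4x4 (2-D) Hadamard transform of the 16 luma DC coefficients
+// (Intra_16x16 mode); it serves as both the forward and the inverse step in
+// h264e_quant_luma_dc_neon below, the Hadamard being its own inverse up to scale.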
+static void hadamar4_2d_neon(int16_t *x)
+{
+ int16x8_t q0 = vld1q_s16(x);
+ int16x8_t q1 = vld1q_s16(x + 8);
+ int16x8_t s = vaddq_s16(q0, q1);
+ int16x8_t d = vsubq_s16(q0, q1);
+ int16x8_t q2 = vcombine_s16(vget_low_s16(s), vget_low_s16(d));
+ int16x8_t q3 = vcombine_s16(vget_high_s16(s), vget_high_s16(d));
+ q0 = vaddq_s16(q2, q3);
+ d = vsubq_s16(q2, q3);
+ q1 = vcombine_s16(vget_high_s16(d), vget_low_s16(d));
+{
+ int16x4x2_t t0 = vtrn_s16(vget_low_s16(q0), vget_high_s16(q0));
+ int16x4x2_t t1 = vtrn_s16(vget_low_s16(q1), vget_high_s16(q1));
+ int32x4x2_t tq = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(t0.val[0], t0.val[1])), vreinterpretq_s32_s16(vcombine_s16(t1.val[0], t1.val[1])));
+
+ q0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[0])));
+ q1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[1])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+
+ s = vaddq_s16(q0, q1);
+ d = vsubq_s16(q0, q1);
+ q2 = vcombine_s16(vget_low_s16(s), vget_low_s16(d));
+ q3 = vcombine_s16(vget_high_s16(s), vget_high_s16(d));
+ q0 = vaddq_s16(q2, q3);
+ d = vsubq_s16(q2, q3);
+ q1 = vcombine_s16(vget_high_s16(d), vget_low_s16(d));
+ vst1q_s16(x, q0);
+ vst1q_s16(x + 8, q1);
+}
+}
+
+static void dequant_dc_neon(quant_t *q, int16_t *qval, int dequant, int n)
+{
+ do q++->dq[0] = (int16_t)(*qval++*(int16_t)dequant); while (--n);
+}
+
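+// Scalar DC quantizer (plain C despite the _neon suffix): each value is scaled
+// by `quant` in Q18 fixed point, with the rounding constant mirrored for
+// negative inputs so rounding is symmetric around zero; the result is stored
+// back into qval and, in scan order, into deq.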
+static void quant_dc_neon(int16_t *qval, int16_t *deq, int16_t quant, int n, int round_q18)
+{
+#if 1
+ int r_minus = (1 << 18) - round_q18;
+ static const uint8_t iscan16[16] = {0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15};
+ static const uint8_t iscan4[4] = {0, 1, 2, 3};
+ const uint8_t *scan = n == 4 ? iscan4 : iscan16;
+ do
+ {
+ int v = *qval;
+ int r = v < 0 ? r_minus : round_q18;
+ deq[*scan++] = *qval++ = (v * quant + r) >> 18;
+ } while (--n);
+#else
+ int r_minus = (1 << 18) - round_q18;
+ do
+ {
+ int v = *qval;
+ int r = v < 0 ? r_minus : round_q18;
+ *deq++ = *qval++ = (v * quant + r) >> 18;
+ } while (--n);
+#endif
+}
+
+static void hadamar2_2d_neon(int16_t *x)
+{
+ int a = x[0];
+ int b = x[1];
+ int c = x[2];
+ int d = x[3];
+ x[0] = (int16_t)(a + b + c + d);
+ x[1] = (int16_t)(a - b + c - d);
+ x[2] = (int16_t)(a + b - c - d);
+ x[3] = (int16_t)(a - b - c + d);
+}
+
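+// The DC coefficients collected by h264e_transform_sub_quant_dequant_neon are
+// stored immediately before the quant_t array, hence the ((int16_t*)q) - 16
+// rewind used by the DC quantizers below.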
+static void h264e_quant_luma_dc_neon(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+ int16_t *tmp = ((int16_t*)q) - 16;
+ hadamar4_2d_neon(tmp);
+ quant_dc_neon(tmp, deq, qdat[0], 16, 0x20000);//0x15555);
+ hadamar4_2d_neon(tmp);
+ assert(!(qdat[1] & 3));
+    // dirty trick here: shift w/o rounding, since it has no effect for qp >= 10 (or, to be precise, for qp >= 9)
+ dequant_dc_neon(q, tmp, qdat[1] >> 2, 16);
+}
+
+static int h264e_quant_chroma_dc_neon(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+ int16_t *tmp = ((int16_t*)q) - 16;
+ hadamar2_2d_neon(tmp);
+ quant_dc_neon(tmp, deq, (int16_t)(qdat[0] << 1), 4, 0xAAAA);
+ hadamar2_2d_neon(tmp);
+ assert(!(qdat[1] & 1));
+ dequant_dc_neon(q, tmp, qdat[1] >> 1, 4);
+ return !!(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
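+// One 1-D pass of the forward 4x4 integer transform, i.e. multiplication by
+// the rows [1 1 1 1; 2 1 -1 -2; 1 -1 -1 1; 1 -2 2 -1], writing with stride s.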
+#define TRANSFORM(x0, x1, x2, x3, p, s) { \
+ int t0 = x0 + x3; \
+ int t1 = x0 - x3; \
+ int t2 = x1 + x2; \
+ int t3 = x1 - x2; \
+ (p)[ 0] = (int16_t)(t0 + t2); \
+ (p)[ s] = (int16_t)(t1*2 + t3); \
+ (p)[2*s] = (int16_t)(t0 - t2); \
+ (p)[3*s] = (int16_t)(t1 - t3*2); \
+}
+
+static void FwdTransformResidual4x42_neon(const uint8_t *inp, const uint8_t *pred, uint32_t inp_stride, int16_t *out)
+{
+#if TRANSPOSE_BLOCK
+ int i;
+ int16_t tmp[16];
+ // Transform columns
+ for (i = 0; i < 4; i++, pred++, inp++)
+ {
+ int f0 = inp[0] - pred[0];
+ int f1 = inp[1*inp_stride] - pred[1*16];
+ int f2 = inp[2*inp_stride] - pred[2*16];
+ int f3 = inp[3*inp_stride] - pred[3*16];
+ TRANSFORM(f0, f1, f2, f3, tmp + i*4, 1);
+ }
+ // Transform rows
+ for (i = 0; i < 4; i++)
+ {
+ int d0 = tmp[i + 0];
+ int d1 = tmp[i + 4];
+ int d2 = tmp[i + 8];
+ int d3 = tmp[i + 12];
+ TRANSFORM(d0, d1, d2, d3, out + i, 4);
+ }
+#else
+ /* Transform rows */
+ uint8x8_t inp0 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(inp)), vreinterpret_s32_u8(vld1_u8(inp + inp_stride))).val[0]);
+ uint8x8_t inp1 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(inp + 2*inp_stride)), vreinterpret_s32_u8(vld1_u8(inp + 3*inp_stride))).val[0]);
+ uint8x8_t pred0 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred)), vreinterpret_s32_u8(vld1_u8(pred + 16))).val[0]);
+ uint8x8_t pred1 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred + 2*16)), vreinterpret_s32_u8(vld1_u8(pred + 3*16))).val[0]);
+ int16x8_t q0 = vreinterpretq_s16_u16(vsubl_u8(inp0, pred0));
+ int16x8_t q1 = vreinterpretq_s16_u16(vsubl_u8(inp1, pred1));
+
+ int16x4x2_t t0 = vtrn_s16(vget_low_s16(q0), vget_high_s16(q0));
+ int16x4x2_t t1 = vtrn_s16(vget_low_s16(q1), vget_high_s16(q1));
+ int32x4x2_t tq = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(t0.val[0], t0.val[1])), vreinterpretq_s32_s16(vcombine_s16(t1.val[0], t1.val[1])));
+
+ int16x4_t d4 = vadd_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+ int16x4_t d5 = vsub_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+ int16x4_t d6 = vadd_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+ int16x4_t d7 = vsub_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+ int16x8_t q2 = vcombine_s16(d4, d5);
+ int16x8_t q3 = vcombine_s16(d6, d7);
+ q0 = vaddq_s16(q2, q3);
+ q0 = vcombine_s16(vget_low_s16(q0), vadd_s16(vget_high_s16(q0), d5));
+ q1 = vsubq_s16(q2, q3);
+ q1 = vcombine_s16(vget_low_s16(q1), vsub_s16(vget_high_s16(q1), d7));
+
+ t0 = vtrn_s16(vget_low_s16(q0), vget_high_s16(q0));
+ t1 = vtrn_s16(vget_low_s16(q1), vget_high_s16(q1));
+ tq = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(t0.val[0], t0.val[1])), vreinterpretq_s32_s16(vcombine_s16(t1.val[0], t1.val[1])));
+
+ d4 = vadd_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+ d5 = vsub_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+ d6 = vadd_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+ d7 = vsub_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+ q2 = vcombine_s16(d4, d5);
+ q3 = vcombine_s16(d6, d7);
+ q0 = vaddq_s16(q2, q3);
+ q0 = vcombine_s16(vget_low_s16(q0), vadd_s16(vget_high_s16(q0), d5));
+ q1 = vsubq_s16(q2, q3);
+ q1 = vcombine_s16(vget_low_s16(q1), vsub_s16(vget_high_s16(q1), d7));
+
+ vst1q_s16(out, q0);
+ vst1q_s16(out + 8, q1);
+#endif
+}
+
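+// Inverse 4x4 integer transform of pSrc plus reconstruction: the prediction
+// (16-byte stride) is added back and the clipped result is written to out;
+// the final (x + 32) >> 6 descaling is folded into vqrshrun_n_s16(.., 6).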
+static void TransformResidual4x4_neon(const int16_t *pSrc, const pix_t *pred, pix_t *out, int out_stride)
+{
+ int16x4_t e0, e1, e2, e3;
+ int16x4_t f0, f1, f2, f3;
+ int16x4_t g0, g1, g2, g3;
+ int16x4_t h0, h1, h2, h3;
+ int16x4_t d0 = vld1_s16(pSrc);
+ int16x4_t d1 = vld1_s16(pSrc + 4);
+ int16x4_t d2 = vld1_s16(pSrc + 8);
+ int16x4_t d3 = vld1_s16(pSrc + 12);
+ int16x4x2_t dd0 = vtrn_s16(d0, d1);
+ int16x4x2_t dd1 = vtrn_s16(d2, d3);
+ int32x4x2_t d = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(dd0.val[0], dd0.val[1])), vreinterpretq_s32_s16(vcombine_s16(dd1.val[0], dd1.val[1])));
+ d0 = vreinterpret_s16_s32(vget_low_s32(d.val[0]));
+ d1 = vreinterpret_s16_s32(vget_high_s32(d.val[0]));
+ d2 = vreinterpret_s16_s32(vget_low_s32(d.val[1]));
+ d3 = vreinterpret_s16_s32(vget_high_s32(d.val[1]));
+
+ e0 = vadd_s16(d0, d2);
+ e1 = vsub_s16(d0, d2);
+ e2 = vsub_s16(vshr_n_s16(d1, 1), d3);
+ e3 = vadd_s16(d1, vshr_n_s16(d3, 1));
+ f0 = vadd_s16(e0, e3);
+ f1 = vadd_s16(e1, e2);
+ f2 = vsub_s16(e1, e2);
+ f3 = vsub_s16(e0, e3);
+
+ dd0 = vtrn_s16(f0, f1);
+ dd1 = vtrn_s16(f2, f3);
+ d = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(dd0.val[0], dd0.val[1])), vreinterpretq_s32_s16(vcombine_s16(dd1.val[0], dd1.val[1])));
+ f0 = vreinterpret_s16_s32(vget_low_s32(d.val[0]));
+ f1 = vreinterpret_s16_s32(vget_high_s32(d.val[0]));
+ f2 = vreinterpret_s16_s32(vget_low_s32(d.val[1]));
+ f3 = vreinterpret_s16_s32(vget_high_s32(d.val[1]));
+
+ g0 = vadd_s16(f0, f2);
+ g1 = vsub_s16(f0, f2);
+ g2 = vsub_s16(vshr_n_s16(f1, 1), f3);
+ g3 = vadd_s16(f1, vshr_n_s16(f3, 1));
+ h0 = vadd_s16(g0, g3);
+ h1 = vadd_s16(g1, g2);
+ h2 = vsub_s16(g1, g2);
+ h3 = vsub_s16(g0, g3);
+
+ {
+ uint8x8_t inp0 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred)), vreinterpret_s32_u8(vld1_u8(pred + 16))).val[0]);
+ uint8x8_t inp1 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred + 2*16)), vreinterpret_s32_u8(vld1_u8(pred + 3*16))).val[0]);
+ int16x8_t a0 = vaddq_s16(vcombine_s16(h0, h1), vreinterpretq_s16_u16(vshll_n_u8(inp0, 6)));
+ int16x8_t a1 = vaddq_s16(vcombine_s16(h2, h3), vreinterpretq_s16_u16(vshll_n_u8(inp1, 6)));
+ uint8x8_t r0 = vqrshrun_n_s16(a0, 6);
+ uint8x8_t r1 = vqrshrun_n_s16(a1, 6);
+ *(uint32_t*)(&out[0*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r0), 0);
+ *(uint32_t*)(&out[1*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r0), 1);
+ *(uint32_t*)(&out[2*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r1), 0);
+ *(uint32_t*)(&out[3*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r1), 1);
+ }
+}
+
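+// Returns nonzero when every coefficient of the 4x4 block has magnitude <= the
+// per-position threshold (lane 0 is skipped when i0 is set, since the DC level
+// is coded separately); used by the small-coefficient zeroing heuristic below.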
+static int is_zero_neon(const int16_t *dat, int i0, const uint16_t *thr)
+{
+ static const uint16x8_t g_ign_first = { 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };
+ int16x8_t v0 = vabsq_s16(*(int16x8_t *)dat);
+ int16x8_t v1 = vabsq_s16(*(int16x8_t *)(dat + 8));
+ int16x8_t t = *(int16x8_t *)thr;
+ uint16x8_t m0 = vcgtq_s16(v0, t);
+ uint16x8_t m1 = vcgtq_s16(v1, t);
+ if (i0)
+ m0 = vandq_u16(m0, g_ign_first);
+ m0 = vorrq_u16(m0, m1);
+ uint16x4_t m4 = vorr_u16(vget_low_u16(m0), vget_high_u16(m0));
+ return !(vget_lane_u32(vreinterpret_u32_u16(m4), 0) | vget_lane_u32(vreinterpret_u32_u16(m4), 1));
+}
+
+static int is_zero4_neon(const quant_t *q, int i0, const uint16_t *thr)
+{
+ return is_zero_neon(q[0].dq, i0, thr) &&
+ is_zero_neon(q[1].dq, i0, thr) &&
+ is_zero_neon(q[4].dq, i0, thr) &&
+ is_zero_neon(q[5].dq, i0, thr);
+}
+
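+// Builds a bitmask of 4x4 blocks whose residual is below threshold and can be
+// forced to zero. For inter macroblocks a second, stricter threshold may zero
+// whole 8x8 quadrants at once (each 0x33-shaped pattern covers the four 4x4
+// blocks of one quadrant in the 4-blocks-per-row layout).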
+static int zero_smallq_neon(quant_t *q, int mode, const uint16_t *qdat)
+{
+ int zmask = 0;
+ int i, i0 = mode & 1, n = mode >> 1;
+ if (mode == QDQ_MODE_INTER || mode == QDQ_MODE_CHROMA)
+ {
+ for (i = 0; i < n*n; i++)
+ {
+ if (is_zero_neon(q[i].dq, i0, qdat + OFFS_THR_1_OFF))
+ {
+ zmask |= (1 << i); //9.19
+ }
+ }
+ if (mode == QDQ_MODE_INTER) //8.27
+ {
+ if ((~zmask & 0x0033) && is_zero4_neon(q + 0, i0, qdat + OFFS_THR_2_OFF)) zmask |= 0x33;
+ if ((~zmask & 0x00CC) && is_zero4_neon(q + 2, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 2);
+ if ((~zmask & 0x3300) && is_zero4_neon(q + 8, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 8);
+ if ((~zmask & 0xCC00) && is_zero4_neon(q + 10, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 10);
+ }
+ }
+ return zmask;
+}
+
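+// Quantizes each 4x4 block: levels go to q->qv in zig-zag order (via the
+// byte-shuffle table iscan16_neon), dequantized values go back to q->dq, and
+// the return value is a bitmask of blocks that still have nonzero levels.
+// When mode has the DC flag set (bit 0), the DC slot is left untouched here
+// because it is transformed and quantized separately.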
+static int quantize_neon(quant_t *q, int mode, const uint16_t *qdat, int zmask)
+{
+#if UNZIGSAG_IN_QUANT
+#if TRANSPOSE_BLOCK
+ // ; Zig-zag scan Transposed zig-zag
+ // ; 0 1 5 6 0 2 3 9
+ // ; 2 4 7 C 1 4 8 A
+ // ; 3 8 B D 5 7 B E
+ // ; 9 A E F 6 C D F
+ static const unsigned char iscan16[16] = {0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15};
+#else
+ static const unsigned char iscan16[16] = {0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15};
+#endif
+#endif
+ int ccol, crow, nz_block_mask = 0;
+ ccol = mode >> 1;
+ crow = ccol;
+ do
+ {
+ do
+ {
+ int nz_mask = 0;
+
+ if (zmask & 1)
+ {
+ int32_t *p = (int32_t *)q->qv;
+ *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+ *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+ } else
+ {
+ static const uint8_t iscan16_neon [] = {
+ 0x00,0x01,0x02,0x03,0x08,0x09,0x10,0x11,
+ 0x0A,0x0B,0x04,0x05,0x06,0x07,0x0C,0x0D,
+ 0x12,0x13,0x18,0x19,0x1A,0x1B,0x14,0x15,
+ 0x0E,0x0F,0x16,0x17,0x1C,0x1D,0x1E,0x1F};
+ static const uint16_t imask16_neon [] = {
+ 0x0001,0x0002,0x0004,0x0008,
+ 0x0010,0x0020,0x0040,0x0080,
+ 0x0100,0x0200,0x0400,0x0800,
+ 0x1000,0x2000,0x4000,0x8000};
+ short save = 0;
+ uint8x16_t q8,q9;
+ int16x8_t q0 = vld1q_s16(q->dq);
+ int16x8_t q1 = vld1q_s16(q->dq + 8);
+ uint16x8_t r = vdupq_n_u16(qdat[OFFS_RND_INTER]);
+ uint16x8_t r0 = veorq_u16(r, vcltq_s16(q0, vdupq_n_s16(0)));
+ uint16x8_t r1 = veorq_u16(r, vcltq_s16(q1, vdupq_n_s16(0)));
+ int16x4_t d4, d5, d6, d7;
+ int16x4_t d22, d23, d24, d25;
+ int16x4_t d26, d27, d28, d29;
+
+ d4 = d6 = vdup_n_s16(qdat[2]);
+ d5 = d7 = vdup_n_s16(qdat[3]);
+ d4 = vset_lane_s16(qdat[0], d4, 0);
+ d4 = vset_lane_s16(qdat[0], d4, 2);
+ d5 = vset_lane_s16(qdat[1], d5, 0);
+ d5 = vset_lane_s16(qdat[1], d5, 2);
+ d6 = vset_lane_s16(qdat[4], d6, 1);
+ d6 = vset_lane_s16(qdat[4], d6, 3);
+ d7 = vset_lane_s16(qdat[5], d7, 1);
+ d7 = vset_lane_s16(qdat[5], d7, 3);
+
+ d22 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_low_s16(q0), d4)), vget_low_u16(r0))), 16);
+ d26 = vmul_s16(d22, d5);
+ d23 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_high_s16(q0), d6)), vget_high_u16(r0))), 16);
+ d27 = vmul_s16(d23, d7);
+ d24 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_low_s16(q1), d4)), vget_low_u16(r1))), 16);
+ d28 = vmul_s16(d24, d5);
+ d25 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_high_s16(q1), d6)), vget_high_u16(r1))), 16);
+ d29 = vmul_s16(d25, d7);
+ if (mode & 1)
+ {
+ save = q->dq[0];
+ }
+ vst1q_s16(q->dq, vcombine_s16(d26, d27));
+ vst1q_s16(q->dq + 8, vcombine_s16(d28, d29));
+ if (mode & 1)
+ {
+ q->dq[0] = save;
+ }
+
+ if (mode & 1)
+ {
+ save = q->qv[0];
+ }
+ q8 = vld1q_u8(iscan16_neon);
+ q9 = vld1q_u8(iscan16_neon + 16);
+
+ {
+// vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && defined(__apple_build_version__)
+ uint8x16x2_t vlut;
+ vlut.val[0] = vreinterpretq_u8_s16(vcombine_s16(d22, d23));
+ vlut.val[1] = vreinterpretq_u8_s16(vcombine_s16(d24, d25));
+ vst1_s16(q->qv + 0, d4 = vreinterpret_s16_u8(vtbl2q_u8(vlut, vget_low_u8(q8))));
+ vst1_s16(q->qv + 4, d5 = vreinterpret_s16_u8(vtbl2q_u8(vlut, vget_high_u8(q8))));
+ vst1_s16(q->qv + 8, d6 = vreinterpret_s16_u8(vtbl2q_u8(vlut, vget_low_u8(q9))));
+ vst1_s16(q->qv +12, d7 = vreinterpret_s16_u8(vtbl2q_u8(vlut, vget_high_u8(q9))));
+#else
+ uint8x8x4_t vlut;
+ vlut.val[0] = vreinterpret_u8_s16(d22);
+ vlut.val[1] = vreinterpret_u8_s16(d23);
+ vlut.val[2] = vreinterpret_u8_s16(d24);
+ vlut.val[3] = vreinterpret_u8_s16(d25);
+ vst1_s16(q->qv + 0, d4 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_low_u8(q8))));
+ vst1_s16(q->qv + 4, d5 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_high_u8(q8))));
+ vst1_s16(q->qv + 8, d6 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_low_u8(q9))));
+ vst1_s16(q->qv +12, d7 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_high_u8(q9))));
+#endif
+ }
+ {
+ uint16x8_t bm0 = vld1q_u16(imask16_neon);
+ uint16x8_t bm1 = vld1q_u16(imask16_neon + 8);
+ uint16x4_t m;
+ bm0 = vandq_u16(bm0, vceqq_s16(vcombine_s16(d4, d5), vdupq_n_s16(0)));
+ bm1 = vandq_u16(bm1, vceqq_s16(vcombine_s16(d6, d7), vdupq_n_s16(0)));
+ bm0 = vorrq_u16(bm0, bm1);
+ m = vorr_u16(vget_low_u16(bm0), vget_high_u16(bm0));
+ m = vpadd_u16(m, m);
+ m = vpadd_u16(m, m);
+ nz_mask = vget_lane_u16(vmvn_u16(m), 0);
+ }
+
+ if (mode & 1)
+ {
+ q->qv[0] = save;
+ nz_mask &= ~1;
+ }
+ }
+
+ zmask >>= 1;
+ nz_block_mask <<= 1;
+ if (nz_mask)
+ nz_block_mask |= 1;
+ q++;
+ } while (--ccol);
+ ccol = mode >> 1;
+ } while (--crow);
+ return nz_block_mask;
+}
+
+static void transform_neon(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q)
+{
+ int crow = mode >> 1;
+ int ccol = crow;
+
+ do
+ {
+ do
+ {
+ FwdTransformResidual4x42_neon(inp, pred, inp_stride, q->dq);
+ q++;
+ inp += 4;
+ pred += 4;
+ } while (--ccol);
+ ccol = mode >> 1;
+ inp += 4*(inp_stride - ccol);
+ pred += 4*(16 - ccol);
+ } while (--crow);
+}
+
+static int h264e_transform_sub_quant_dequant_neon(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat)
+{
+ int zmask;
+ transform_neon(inp, pred, inp_stride, mode, q);
+ if (mode & 1) // QDQ_MODE_INTRA_16 || QDQ_MODE_CHROMA
+ {
+ int cloop = (mode >> 1)*(mode >> 1);
+ short *dc = ((short *)q) - 16;
+ quant_t *pq = q;
+ do
+ {
+ *dc++ = pq->dq[0];
+ pq++;
+ } while (--cloop);
+ }
+ zmask = zero_smallq_neon(q, mode, qdat);
+ return quantize_neon(q, mode, qdat, zmask);
+}
+
+static void h264e_transform_add_neon(pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask)
+{
+ int crow = side;
+ int ccol = crow;
+
+ assert(!((unsigned)out % 4));
+ assert(!((unsigned)pred % 4));
+ assert(!(out_stride % 4));
+ do
+ {
+ do
+ {
+ if (mask >= 0)
+ {
+ // copy 4x4
+ pix_t *dst = out;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 0 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 1 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 2 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 3 * 16);
+ } else
+ {
+ TransformResidual4x4_neon(q->dq, pred, out, out_stride);
+ }
+            mask = (uint32_t)mask << 1; // shift as unsigned to avoid UB when the sign bit is set (as in the plain C version)
+ q++;
+ out += 4;
+ pred += 4;
+ } while (--ccol);
+ ccol = side;
+ out += 4*(out_stride - ccol);
+ pred += 4*(16 - ccol);
+ } while (--crow);
+}
+#endif
+
+#if H264E_ENABLE_PLAIN_C
+
+static uint8_t byteclip_deblock(int x)
+{
+ if (x > 255)
+ {
+ return 255;
+ }
+ if (x < 0)
+ {
+ return 0;
+ }
+ return (uint8_t)x;
+}
+
+static int clip_range(int range, int src)
+{
+ if (src > range)
+ {
+ src = range;
+ }
+ if (src < -range)
+ {
+ src = -range;
+ }
+ return src;
+}
+
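+// Filters one chroma edge sample pair: the edge is skipped when the gradients
+// exceed the alpha/beta thresholds; strength 1..3 applies the tC-clipped delta
+// filter and strength 4 the strong (averaging) filter, as in normal H.264
+// in-loop deblocking.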
+static void deblock_chroma(uint8_t *pix, int stride, int alpha, int beta, int thr, int strength)
+{
+ int p1, p0, q0, q1;
+ int delta;
+
+ if (strength == 0)
+ {
+ return;
+ }
+
+ p1 = pix[-2*stride];
+ p0 = pix[-1*stride];
+ q0 = pix[ 0*stride];
+ q1 = pix[ 1*stride];
+
+ if (ABS(p0 - q0) >= alpha || ABS(p1 - p0) >= beta || ABS(q1 - q0) >= beta)
+ {
+ return;
+ }
+
+ if (strength < 4)
+ {
+ int tC = thr + 1;
+ delta = (((q0 - p0)*4) + (p1 - q1) + 4) >> 3;
+ delta = clip_range(tC, delta);
+ pix[-1*stride] = byteclip_deblock(p0 + delta);
+ pix[ 0*stride] = byteclip_deblock(q0 - delta);
+ } else
+ {
+ pix[-1*stride] = (pix_t)((2*p1 + p0 + q1 + 2) >> 2);
+ pix[ 0*stride] = (pix_t)((2*q1 + q0 + p1 + 2) >> 2);
+ }
+}
+
+static void deblock_luma_v(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+ int p2, p1, p0, q0, q1, q2, thr;
+ int ap, aq, delta, cloop, i;
+ for (i = 0; i < 4; i++)
+ {
+ cloop = 4;
+ if (pstr[i])
+ {
+ thr = pthr[i];
+ do
+ {
+ p1 = pix[-2];
+ p0 = pix[-1];
+ q0 = pix[ 0];
+ q1 = pix[ 1];
+
+ //if (ABS(p0 - q0) < alpha && ABS(p1 - p0) < beta && ABS(q1 - q0) < beta)
+ if (((ABS(p0 - q0) - alpha) & (ABS(p1 - p0) - beta) & (ABS(q1 - q0) - beta)) < 0)
+ {
+ int tC, sp, sq, d2;
+                    // avoid conditionals: use sign masks (sp, sq) instead of branches
+ p2 = pix[-3];
+ q2 = pix[ 2];
+ ap = ABS(p2 - p0);
+ aq = ABS(q2 - q0);
+ delta = (((q0 - p0)*4) + (p1 - q1) + 4) >> 3;
+
+ sp = (ap - beta) >> 31;
+ sq = (aq - beta) >> 31;
+ d2 = (((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1) & sp;
+ d2 = clip_range(thr, d2);
+ pix[-2] = (pix_t)(p1 + d2);
+ d2 = (((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1) & sq;
+ d2 = clip_range(thr, d2);
+ pix[ 1] = (pix_t)(q1 + d2);
+ tC = thr - sp - sq;
+ delta = clip_range(tC, delta);
+ pix[-1] = byteclip_deblock(p0 + delta);
+ pix[ 0] = byteclip_deblock(q0 - delta);
+ }
+ pix += stride;
+ } while (--cloop);
+ } else
+ {
+ pix += 4*stride;
+ }
+ }
+}
+
+static void deblock_luma_h_s4(uint8_t *pix, int stride, int alpha, int beta)
+{
+ int p3, p2, p1, p0, q0, q1, q2, q3;
+ int ap, aq, cloop = 16;
+ do
+ {
+ int abs_p0_q0, abs_p1_p0, abs_q1_q0;
+ p1 = pix[-2*stride];
+ p0 = pix[-1*stride];
+ q0 = pix[ 0*stride];
+ q1 = pix[ 1*stride];
+ abs_p0_q0 = ABS(p0 - q0);
+ abs_p1_p0 = ABS(p1 - p0);
+ abs_q1_q0 = ABS(q1 - q0);
+ if (abs_p0_q0 < alpha && abs_p1_p0 < beta && abs_q1_q0 < beta)
+ {
+ int short_p = (2*p1 + p0 + q1 + 2);
+ int short_q = (2*q1 + q0 + p1 + 2);
+
+ if (abs_p0_q0 < ((alpha>>2)+2))
+ {
+ p2 = pix[-3*stride];
+ q2 = pix[ 2*stride];
+ ap = ABS(p2 - p0);
+ aq = ABS(q2 - q0);
+ if (ap < beta)
+ {
+ int t = p2 + p1 + p0 + q0 + 2;
+ p3 = pix[-4*stride];
+ short_p += t - p1 + q0; //(p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3);
+ short_p >>= 1;
+ pix[-2*stride] = (pix_t)(t >> 2);
+ pix[-3*stride] = (pix_t)((2*p3 + 2*p2 + t + 2) >> 3); //(2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3);
+ }
+ if (aq < beta)
+ {
+ int t = q2 + q1 + p0 + q0 + 2;
+ q3 = pix[ 3*stride];
+ short_q += (t - q1 + p0);//(q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3);
+ short_q >>= 1;
+ pix[ 1*stride] = (pix_t)(t >> 2);
+ pix[ 2*stride] = (pix_t)((2*q3 + 2*q2 + t + 2) >> 3); //((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3);
+ }
+ }
+ pix[-1*stride] = (pix_t)(short_p >> 2);
+ pix[ 0*stride] = (pix_t)(short_q >> 2);
+ }
+ pix += 1;
+ } while (--cloop);
+}
+
+static void deblock_luma_v_s4(uint8_t *pix, int stride, int alpha, int beta)
+{
+ int p3, p2, p1, p0, q0, q1, q2, q3;
+ int ap, aq, cloop = 16;
+ do
+ {
+ p2 = pix[-3];
+ p1 = pix[-2];
+ p0 = pix[-1];
+ q0 = pix[ 0];
+ q1 = pix[ 1];
+ q2 = pix[ 2];
+ if (ABS(p0 - q0) < alpha && ABS(p1 - p0) < beta && ABS(q1 - q0) < beta)
+ {
+ ap = ABS(p2 - p0);
+ aq = ABS(q2 - q0);
+
+ if (ap < beta && ABS(p0 - q0) < ((alpha >> 2) + 2))
+ {
+ p3 = pix[-4];
+ pix[-1] = (pix_t)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3);
+ pix[-2] = (pix_t)((p2 + p1 + p0 + q0 + 2) >> 2);
+ pix[-3] = (pix_t)((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3);
+ } else
+ {
+ pix[-1] = (pix_t)((2*p1 + p0 + q1 + 2) >> 2);
+ }
+
+ if (aq < beta && ABS(p0 - q0) < ((alpha >> 2) + 2))
+ {
+ q3 = pix[ 3];
+ pix[ 0] = (pix_t)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3);
+ pix[ 1] = (pix_t)((q2 + q1 + p0 + q0 + 2) >> 2);
+ pix[ 2] = (pix_t)((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3);
+ } else
+ {
+ pix[ 0] = (pix_t)((2*q1 + q0 + p1 + 2) >> 2);
+ }
+ }
+ pix += stride;
+ } while (--cloop);
+}
+
+static void deblock_luma_h(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+ int p2, p1, p0, q0, q1, q2;
+ int ap, aq, delta, i;
+ for (i = 0; i < 4; i++)
+ {
+ if (pstr[i])
+ {
+ int cloop = 4;
+ int thr = pthr[i];
+ do
+ {
+ p1 = pix[-2*stride];
+ p0 = pix[-1*stride];
+ q0 = pix[ 0*stride];
+ q1 = pix[ 1*stride];
+
+ //if (ABS(p0-q0) < alpha && ABS(p1-p0) < beta && ABS(q1-q0) < beta)
+ if (((ABS(p0-q0) - alpha) & (ABS(p1-p0) - beta) & (ABS(q1-q0) - beta)) < 0)
+ {
+ int tC, sp, sq, d2;
+ p2 = pix[-3*stride];
+ q2 = pix[ 2*stride];
+ ap = ABS(p2 - p0);
+ aq = ABS(q2 - q0);
+ delta = (((q0 - p0)*4) + (p1 - q1) + 4) >> 3;
+
+ sp = (ap - beta) >> 31;
+ d2 = (((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1) & sp;
+ d2 = clip_range(thr, d2);
+ pix[-2*stride] = (pix_t)(p1 + d2);
+
+ sq = (aq - beta) >> 31;
+ d2 = (((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1) & sq;
+ d2 = clip_range(thr, d2);
+ pix[ 1*stride] = (pix_t)(q1 + d2);
+
+ tC = thr - sp - sq;
+ delta = clip_range(tC, delta);
+
+ pix[-1*stride] = byteclip_deblock(p0 + delta);
+ pix[ 0*stride] = byteclip_deblock(q0 - delta);
+ }
+ pix += 1;
+ } while (--cloop);
+ } else
+ {
+ pix += 4;
+ }
+ }
+}
+
+static void deblock_chroma_v(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+ int i;
+ for (i = 0; i < 8; i++)
+ {
+ deblock_chroma(pix, 1, a, b, thr[i >> 1], str[i >> 1]);
+ pix += stride;
+ }
+}
+
+static void deblock_chroma_h(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+ int i;
+ for (i = 0; i < 8; i++)
+ {
+ deblock_chroma(pix, stride, a, b, thr[i >> 1], str[i >> 1]);
+ pix += 1;
+ }
+}
+
+static void h264e_deblock_chroma(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+ const uint8_t *alpha = par->alpha;
+ const uint8_t *beta = par->beta;
+ const uint8_t *thr = par->tc0;
+ const uint8_t *strength = (uint8_t *)par->strength32;
+ int a,b,x,y;
+ a = alpha[0];
+ b = beta[0];
+ for (x = 0; x < 16; x += 8)
+ {
+ uint32_t str = *(uint32_t*)&strength[x];
+ if (str && a)
+ {
+ deblock_chroma_v(pix + (x >> 1), stride, a, b, thr + x, strength + x);
+ }
+ a = alpha[1];
+ b = beta[1];
+ }
+ thr += 16;
+ strength += 16;
+ a = alpha[2];
+ b = beta[2];
+ for (y = 0; y < 16; y += 8)
+ {
+ uint32_t str = *(uint32_t*)&strength[y];
+ if (str && a)
+ {
+ deblock_chroma_h(pix, stride, a, b, thr + y, strength + y);
+ }
+ pix += 4*stride;
+ a = alpha[3];
+ b = beta[3];
+ }
+}
+
+static void h264e_deblock_luma(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+ const uint8_t *alpha = par->alpha;
+ const uint8_t *beta = par->beta;
+ const uint8_t *thr = par->tc0;
+ const uint8_t *strength = (uint8_t *)par->strength32;
+ int a = alpha[0];
+ int b = beta[0];
+ int x, y;
+ for (x = 0; x < 16; x += 4)
+ {
+ uint32_t str = *(uint32_t*)&strength[x];
+ if ((uint8_t)str == 4)
+ {
+ deblock_luma_v_s4(pix + x, stride, a, b);
+ } else if (str && a)
+ {
+ deblock_luma_v(pix + x, stride, a, b, thr + x, strength + x);
+ }
+ a = alpha[1];
+ b = beta[1];
+ }
+ a = alpha[2];
+ b = beta[2];
+ thr += 16;
+ strength += 16;
+ for (y = 0; y < 16; y += 4)
+ {
+ uint32_t str = *(uint32_t*)&strength[y];
+ if ((uint8_t)str == 4)
+ {
+ deblock_luma_h_s4(pix, stride, a, b);
+ } else if (str && a)
+ {
+ deblock_luma_h(pix, stride, a, b, thr + y, strength + y);
+ }
+ a = alpha[3];
+ b = beta[3];
+ pix += 4*stride;
+ }
+}
+
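+// Temporal denoiser: for every interior pixel a Q16 blend factor is derived
+// from the pixel difference and the averaged 4-neighbourhood difference (via
+// the g_diff_to_gainQ8 table), and the previous frame is blended with the
+// current one; the result is written back into frmprev, with the frame borders
+// copied from the current frame unfiltered.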
+static void h264e_denoise_run(unsigned char *frm, unsigned char *frmprev, int w, int h_arg, int stride_frm, int stride_frmprev)
+{
+ int cloop, h = h_arg;
+ if (w <= 2 || h <= 2)
+ {
+ return;
+ }
+ w -= 2;
+ h -= 2;
+
+ do
+ {
+ unsigned char *pf = frm += stride_frm;
+ unsigned char *pp = frmprev += stride_frmprev;
+ cloop = w;
+ pp[-stride_frmprev] = *pf++;
+ pp++;
+ do
+ {
+ int d, neighbourhood;
+ unsigned g, gd, gn, out_val;
+ d = pf[0] - pp[0];
+ neighbourhood = pf[-1] - pp[-1];
+ neighbourhood += pf[+1] - pp[+1];
+ neighbourhood += pf[-stride_frm] - pp[-stride_frmprev];
+ neighbourhood += pf[+stride_frm] - pp[+stride_frmprev];
+
+ if (d < 0)
+ {
+ d = -d;
+ }
+ if (neighbourhood < 0)
+ {
+ neighbourhood = -neighbourhood;
+ }
+ neighbourhood >>= 2;
+
+ gd = g_diff_to_gainQ8[d];
+ gn = g_diff_to_gainQ8[neighbourhood];
+
+ gn <<= 2;
+ if (gn > 255)
+ {
+ gn = 255;
+ }
+
+ gn = 255 - gn;
+ gd = 255 - gd;
+ g = gn*gd; // Q8*Q8 = Q16;
+
+ //out_val = ((pp[0]*g ) >> 16) + (((0xffff-g)*pf[0] ) >> 16);
+ //out_val = ((pp[0]*g + (1<<15)) >> 16) + (((0xffff-g)*pf[0] + (1<<15)) >> 16);
+ out_val = (pp[0]*g + (0xffff - g)*pf[0] + (1 << 15)) >> 16;
+
+ assert(out_val <= 255);
+
+ pp[-stride_frmprev] = (unsigned char)out_val;
+ //pp[-stride_frmprev] = gd;//(unsigned char)((neighbourhood+1)>255?255:(neighbourhood+1));
+
+ pf++, pp++;
+ } while (--cloop);
+
+ pp[-stride_frmprev] = *pf;
+ } while(--h);
+
+ memcpy(frmprev + stride_frmprev, frm + stride_frm, w + 2);
+ h = h_arg - 2;
+ do
+ {
+ memcpy(frmprev, frmprev - stride_frmprev, w + 2);
+ frmprev -= stride_frmprev;
+ } while(--h);
+ memcpy(frmprev, frm - stride_frm*(h_arg - 2), w + 2);
+}
+
+#undef IS_NULL
+#define IS_NULL(p) ((p) < (pix_t *)32)
+
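+// DC intra prediction: averages the available left/top neighbours (128 when
+// neither side is available) and replicates the value into all four bytes of
+// the result so callers can fill the block with 32-bit stores.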
+static uint32_t intra_predict_dc(const pix_t *left, const pix_t *top, int log_side)
+{
+ unsigned dc = 0, side = 1u << log_side, round = 0;
+ do
+ {
+ if (!IS_NULL(left))
+ {
+ int cloop = side;
+ round += side >> 1;
+ do
+ {
+ dc += *left++;
+ dc += *left++;
+ dc += *left++;
+ dc += *left++;
+ } while(cloop -= 4);
+ }
+ left = top;
+ top = NULL;
+ } while (left);
+ dc += round;
+ if (round == side)
+ dc >>= 1;
+ dc >>= log_side;
+ if (!round) dc = 128;
+ return dc * 0x01010101;
+}
+
+/*
+ * Note: To make the code more readable we refer to the neighboring pixels
+ * in variables named as below:
+ *
+ * UL U0 U1 U2 U3 U4 U5 U6 U7
+ * L0 xx xx xx xx
+ * L1 xx xx xx xx
+ * L2 xx xx xx xx
+ * L3 xx xx xx xx
+ */
+#define UL edge[-1]
+#define U0 edge[0]
+#define T1 edge[1]
+#define U2 edge[2]
+#define U3 edge[3]
+#define U4 edge[4]
+#define U5 edge[5]
+#define U6 edge[6]
+#define U7 edge[7]
+#define L0 edge[-2]
+#define L1 edge[-3]
+#define L2 edge[-4]
+#define L3 edge[-5]
+
+static void h264e_intra_predict_16x16(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+ int cloop = 16;
+ uint32_t *d = (uint32_t*)predict;
+ assert(IS_ALIGNED(predict, 4));
+ assert(IS_ALIGNED(top, 4));
+ if (mode != 1)
+ {
+ uint32_t t0, t1, t2, t3;
+ if (mode < 1)
+ {
+ t0 = ((uint32_t*)top)[0];
+ t1 = ((uint32_t*)top)[1];
+ t2 = ((uint32_t*)top)[2];
+ t3 = ((uint32_t*)top)[3];
+ } else //(mode == 2)
+ {
+ t0 = t1 = t2 = t3 = intra_predict_dc(left, top, 4);
+ }
+ do
+ {
+ *d++ = t0;
+ *d++ = t1;
+ *d++ = t2;
+ *d++ = t3;
+ } while (--cloop);
+ } else //if (mode == 1)
+ {
+ do
+ {
+ uint32_t val = *left++ * 0x01010101u;
+ *d++ = val;
+ *d++ = val;
+ *d++ = val;
+ *d++ = val;
+ } while (--cloop);
+ }
+}
+
+static void h264e_intra_predict_chroma(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+ int cloop = 8;
+ uint32_t *d = (uint32_t*)predict;
+ assert(IS_ALIGNED(predict, 4));
+ assert(IS_ALIGNED(top, 4));
+ if (mode < 1)
+ {
+ uint32_t t0, t1, t2, t3;
+ t0 = ((uint32_t*)top)[0];
+ t1 = ((uint32_t*)top)[1];
+ t2 = ((uint32_t*)top)[2];
+ t3 = ((uint32_t*)top)[3];
+ do
+ {
+ *d++ = t0;
+ *d++ = t1;
+ *d++ = t2;
+ *d++ = t3;
+ } while (--cloop);
+ } else if (mode == 1)
+ {
+ do
+ {
+ uint32_t u = left[0] * 0x01010101u;
+ uint32_t v = left[8] * 0x01010101u;
+ d[0] = u;
+ d[1] = u;
+ d[2] = v;
+ d[3] = v;
+ d += 4;
+ left++;
+ } while(--cloop);
+ } else //if (mode == 2)
+ {
+ int ccloop = 2;
+ cloop = 2;
+ do
+ {
+ d[0] = d[1] = d[16] = intra_predict_dc(left, top, 2);
+ d[17] = intra_predict_dc(left + 4, top + 4, 2);
+ if (!IS_NULL(top))
+ {
+ d[1] = intra_predict_dc(NULL, top + 4, 2);
+ }
+ if (!IS_NULL(left))
+ {
+ d[16] = intra_predict_dc(NULL, left + 4, 2);
+ }
+ d += 2;
+ left += 8;
+ top += 8;
+ } while(--cloop);
+
+ do
+ {
+ cloop = 12;
+ do
+ {
+ *d = d[-4];
+ d++;
+ } while(--cloop);
+ d += 4;
+ } while(--ccloop);
+ }
+}
+
+static int pix_sad_4(uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3,
+ uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3)
+{
+#if defined(__arm__)
+ int sad = __usad8(r0, x0);
+ sad = __usada8(r1, x1, sad);
+ sad = __usada8(r2, x2, sad);
+ sad = __usada8(r3, x3, sad);
+ return sad;
+#else
+ int c, sad = 0;
+ for (c = 0; c < 4; c++)
+ {
+ int d = (r0 & 0xff) - (x0 & 0xff); r0 >>= 8; x0 >>= 8;
+ sad += ABS(d);
+ }
+ for (c = 0; c < 4; c++)
+ {
+ int d = (r1 & 0xff) - (x1 & 0xff); r1 >>= 8; x1 >>= 8;
+ sad += ABS(d);
+ }
+ for (c = 0; c < 4; c++)
+ {
+ int d = (r2 & 0xff) - (x2 & 0xff); r2 >>= 8; x2 >>= 8;
+ sad += ABS(d);
+ }
+ for (c = 0; c < 4; c++)
+ {
+ int d = (r3 & 0xff) - (x3 & 0xff); r3 >>= 8; x3 >>= 8;
+ sad += ABS(d);
+ }
+ return sad;
+#endif
+}
+
+static int h264e_intra_choose_4x4(const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty)
+{
+ int sad, best_sad, best_m = 2;
+
+ uint32_t r0, r1, r2, r3;
+ uint32_t x0, x1, x2, x3, x;
+
+ r0 = ((uint32_t *)blockin)[ 0];
+ r1 = ((uint32_t *)blockin)[ 4];
+ r2 = ((uint32_t *)blockin)[ 8];
+ r3 = ((uint32_t *)blockin)[12];
+#undef TEST
+#define TEST(mode) sad = pix_sad_4(r0, r1, r2, r3, x0, x1, x2, x3); \
+ if (mode != mpred) sad += penalty; \
+ if (sad < best_sad) \
+ { \
+ ((uint32_t *)blockpred)[ 0] = x0; \
+ ((uint32_t *)blockpred)[ 4] = x1; \
+ ((uint32_t *)blockpred)[ 8] = x2; \
+ ((uint32_t *)blockpred)[12] = x3; \
+ best_sad = sad; \
+ best_m = mode; \
+ }
+
+ // DC
+ x0 = x1 = x2 = x3 = intra_predict_dc((avail & AVAIL_L) ? &L3 : 0, (avail & AVAIL_T) ? &U0 : 0, 2);
+ best_sad = pix_sad_4(r0, r1, r2, r3, x0, x1, x2, x3);
+ if (2 != mpred)
+ {
+ best_sad += penalty;
+ }
+ ((uint32_t *)blockpred)[ 0] = x0;
+ ((uint32_t *)blockpred)[ 4] = x1;
+ ((uint32_t *)blockpred)[ 8] = x2;
+ ((uint32_t *)blockpred)[12] = x3;
+
+
+ if (avail & AVAIL_T)
+ {
+ uint32_t save = *(uint32_t*)&U4;
+ if (!(avail & AVAIL_TR))
+ {
+ *(uint32_t*)&U4 = U3*0x01010101u;
+ }
+
+ x0 = x1 = x2 = x3 = *(uint32_t*)&U0;
+ TEST(0)
+
+ x = ((U6 + 3u*U7 + 2u) >> 2) << 24;
+ x |= ((U5 + 2u*U6 + U7 + 2u) >> 2) << 16;
+ x |= ((U4 + 2u*U5 + U6 + 2u) >> 2) << 8;
+ x |= ((U3 + 2u*U4 + U5 + 2u) >> 2);
+
+ x3 = x;
+ x = (x << 8) | ((U2 + 2u*U3 + U4 + 2u) >> 2);
+ x2 = x;
+ x = (x << 8) | ((T1 + 2u*U2 + U3 + 2u) >> 2);
+ x1 = x;
+ x = (x << 8) | ((U0 + 2u*T1 + U2 + 2u) >> 2);
+ x0 = x;
+ TEST(3)
+
+ x3 = x1;
+ x1 = x0;
+
+ x = ((U4 + U5 + 1u) >> 1) << 24;
+ x |= ((U3 + U4 + 1u) >> 1) << 16;
+ x |= ((U2 + U3 + 1u) >> 1) << 8;
+ x |= ((T1 + U2 + 1u) >> 1);
+ x2 = x;
+ x = (x << 8) | ((U0 + T1 + 1) >> 1);
+ x0 = x;
+ TEST(7)
+
+ *(uint32_t*)&U4 = save;
+ }
+
+ if (avail & AVAIL_L)
+ {
+ x0 = 0x01010101u * L0;
+ x1 = 0x01010101u * L1;
+ x2 = 0x01010101u * L2;
+ x3 = 0x01010101u * L3;
+ TEST(1)
+
+ x = x3;
+ x <<= 16;
+ x |= ((L2 + 3u*L3 + 2u) >> 2) << 8;
+ x |= ((L2 + L3 + 1u) >> 1);
+ x2 = x;
+ x <<= 16;
+ x |= ((L1 + 2u*L2 + L3 + 2u) >> 2) << 8;
+ x |= ((L1 + L2 + 1u) >> 1);
+ x1 = x;
+ x <<= 16;
+ x |= ((L0 + 2u*L1 + L2 + 2u) >> 2) << 8;
+ x |= ((L0 + L1 + 1u) >> 1);
+ x0 = x;
+ TEST(8)
+ }
+
+ if ((avail & (AVAIL_T | AVAIL_L | AVAIL_TL)) == (AVAIL_T | AVAIL_L | AVAIL_TL))
+ {
+ uint32_t line0, line3;
+ x = ((U3 + 2u*U2 + T1 + 2u) >> 2) << 24;
+ x |= ((U2 + 2u*T1 + U0 + 2u) >> 2) << 16;
+ x |= ((T1 + 2u*U0 + UL + 2u) >> 2) << 8;
+ x |= ((U0 + 2u*UL + L0 + 2u) >> 2);
+ line0 = x;
+ x0 = x;
+ x = (x << 8) | ((UL + 2u*L0 + L1 + 2u) >> 2);
+ x1 = x;
+ x = (x << 8) | ((L0 + 2u*L1 + L2 + 2u) >> 2);
+ x2 = x;
+ x = (x << 8) | ((L1 + 2u*L2 + L3 + 2u) >> 2);
+ x3 = x;
+ line3 = x;
+ TEST(4)
+
+ x = x0 << 8;
+ x |= ((UL + L0 + 1u) >> 1);
+ x0 = x;
+ x <<= 8;
+ x |= (line3 >> 16) & 0xff;
+ x <<= 8;
+ x |= ((L0 + L1 + 1u) >> 1);
+ x1 = x;
+ x <<= 8;
+ x |= (line3 >> 8) & 0xff;
+ x <<= 8;
+ x |= ((L1 + L2 + 1u) >> 1);
+ x2 = x;
+ x <<= 8;
+ x |= line3 & 0xff;
+ x <<= 8;
+ x |= ((L2 + L3 + 1u) >> 1);
+ x3 = x;
+ TEST(6)
+
+ x1 = line0;
+ x3 = (x1 << 8) | ((line3 >> 8) & 0xFF);
+
+ x = ((U2 + U3 + 1u) >> 1) << 24;
+ x |= ((T1 + U2 + 1u) >> 1) << 16;
+ x |= ((U0 + T1 + 1u) >> 1) << 8;
+ x |= ((UL + U0 + 1u) >> 1);
+ x0 = x;
+ x = (x << 8) | ((line3 >> 16) & 0xFF);
+ x2 = x;
+ TEST(5)
+ }
+ return best_m + (best_sad << 4);
+}
+
+static uint8_t byteclip(int x)
+{
+ if (x > 255) x = 255;
+ if (x < 0) x = 0;
+ return (uint8_t)x;
+}
+
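+// The 6-tap H.264 half-pel filter kernel (1, -5, 20, 20, -5, 1) applied with
+// step s; callers add the rounding term and shift.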
+static int hpel_lpf(const uint8_t *p, int s)
+{
+ return p[0] - 5*p[s] + 20*p[2*s] + 20*p[3*s] - 5*p[4*s] + p[5*s];
+}
+
+static void copy_wh(const uint8_t *src, int src_stride, uint8_t *dst, int w, int h)
+{
+ int x, y;
+ for (y = 0; y < h; y++)
+ {
+ for (x = 0; x < w; x++)
+ {
+ dst [x] = src [x];
+ }
+ dst += 16;
+ src += src_stride;
+ }
+}
+
+static void hpel_lpf_diag(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ ALIGN(16) int16_t scratch[21 * 16] ALIGN2(16); /* 21 rows by 16 pixels per row */
+
+    /*
+     * Intermediate values are horizontal half-pel samples, starting at
+     * (0.5, -2) at the top and extending to (0.5, height + 3) at the bottom;
+     * scratch holds a 2D array of size (w) x (h + 5)
+     */
+ int y, x;
+ for (y = 0; y < h + 5; y++)
+ {
+ for (x = 0; x < w; x++)
+ {
+ scratch[y * w + x] = (int16_t)hpel_lpf(src + (y - 2) * src_stride + (x - 2), 1);
+ }
+ }
+
+ /* Vertical interpolate */
+ for (y = 0; y < h; y++)
+ {
+ for (x = 0; x < w; x++)
+ {
+ int pos = y * w + x;
+ int HalfCoeff =
+ scratch [pos] -
+ 5 * scratch [pos + 1 * w] +
+ 20 * scratch [pos + 2 * w] +
+ 20 * scratch [pos + 3 * w] -
+ 5 * scratch [pos + 4 * w] +
+ scratch [pos + 5 * w];
+
+ HalfCoeff = byteclip((HalfCoeff + 512) >> 10);
+
+ dst [y * 16 + x] = (uint8_t)HalfCoeff;
+ }
+ }
+}
+
+static void hpel_lpf_hor(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ int x, y;
+ for (y = 0; y < h; y++)
+ {
+ for (x = 0; x < w; x++)
+ {
+ dst [y * 16 + x] = byteclip((hpel_lpf(src + y * src_stride + (x - 2), 1) + 16) >> 5);
+ }
+ }
+}
+
+static void hpel_lpf_ver(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+ int y, x;
+ for (y = 0; y < h; y++)
+ {
+ for (x = 0; x < w; x++)
+ {
+ dst [y * 16 + x] = byteclip((hpel_lpf(src + (y - 2) * src_stride + x, src_stride) + 16) >> 5);
+ }
+ }
+}
+
+static void average_16x16_unalign(uint8_t *dst, const uint8_t *src1, int src1_stride)
+{
+ int x, y;
+ for (y = 0; y < 16; y++)
+ {
+ for (x = 0; x < 16; x++)
+ {
+ dst[y * 16 + x] = (uint8_t)(((uint32_t)dst [y * 16 + x] + src1[y*src1_stride + x] + 1) >> 1);
+ }
+ }
+}
+
+static void h264e_qpel_average_wh_align(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, point_t wh)
+{
+ int w = wh.s.x;
+ int h = wh.s.y;
+ int x, y;
+ for (y = 0; y < h; y++)
+ {
+ for (x = 0; x < w; x++)
+ {
+ dst[y * 16 + x] = (uint8_t)((src0[y * 16 + x] + src1[y * 16 + x] + 1) >> 1);
+ }
+ }
+}
+
+static void h264e_qpel_interpolate_luma(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+ ALIGN(16) uint8_t scratch[16*16] ALIGN2(16);
+ // src += ((dx + 1) >> 2) + ((dy + 1) >> 2)*src_stride; // dx == 3 ? next row; dy == 3 ? next line
+ // dxdy actions: Horizontal, Vertical, Diagonal, Average
+ // 0 1 2 3 +1 - ha h ha+
+ // 1 va hva hda hv+a
+ // 2 v vda d v+da
+ // 3 va+ h+va h+da h+v+a
+ // +stride
+ int32_t pos = 1 << (dxdy.s.x + 4*dxdy.s.y);
+ int dstused = 0;
+
+ if (pos == 1)
+ {
+ copy_wh(src, src_stride, dst, wh.s.x, wh.s.y);
+ return;
+ }
+ if (pos & 0xe0ee)// 1110 0000 1110 1110
+ {
+ hpel_lpf_hor(src + ((pos & 0xe000) ? src_stride : 0), src_stride, dst, wh.s.x, wh.s.y);
+ dstused++;
+ }
+ if (pos & 0xbbb0)// 1011 1011 1011 0000
+ {
+ hpel_lpf_ver(src + ((pos & 0x8880) ? 1 : 0), src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+ dstused++;
+ }
+ if (pos & 0x4e40)// 0100 1110 0100 0000
+ {
+ hpel_lpf_diag(src, src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+ dstused++;
+ }
+ if (pos & 0xfafa)// 1111 1010 1111 1010
+ {
+ assert(wh.s.x == 16 && wh.s.y == 16);
+ if (dstused == 2)
+ {
+ point_t p;
+
+ src = scratch;
+ p.u32 = 16 + (16<<16);
+
+ h264e_qpel_average_wh_align(src, dst, dst, p);
+ return;
+ } else
+ {
+ src += ((dxdy.s.x + 1) >> 2) + ((dxdy.s.y + 1) >> 2)*src_stride;
+ }
+ average_16x16_unalign(dst, src, src_stride);
+ }
+}
+
+static void h264e_qpel_interpolate_chroma(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    /* if the fractional mv is not (0, 0) */
+ if (dxdy.u32)
+ {
+ int a = (8 - dxdy.s.x) * (8 - dxdy.s.y);
+ int b = dxdy.s.x * (8 - dxdy.s.y);
+ int c = (8 - dxdy.s.x) * dxdy.s.y;
+ int d = dxdy.s.x * dxdy.s.y;
+ int h = wh.s.y;
+ do
+ {
+ int x;
+ for (x = 0; x < wh.s.x; x++)
+ {
+ dst[x] = (uint8_t)((
+ a * src[ x] + b * src[ x + 1] +
+ c * src[src_stride + x] + d * src[src_stride + x + 1] +
+ 32) >> 6);
+ }
+ dst += 16;
+ src += src_stride;
+ } while (--h);
+ } else
+ {
+ copy_wh(src, src_stride, dst, wh.s.x, wh.s.y);
+ }
+}
+
+static int sad_block(const pix_t *a, int a_stride, const pix_t *b, int b_stride, int w, int h)
+{
+ int r, c, sad = 0;
+ for (r = 0; r < h; r++)
+ {
+ for (c = 0; c < w; c++)
+ {
+ int d = a[c] - b[c];
+ sad += ABS(d);
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+static int h264e_sad_mb_unlaign_8x8(const pix_t *a, int a_stride, const pix_t *b, int sad[4])
+{
+ sad[0] = sad_block(a, a_stride, b, 16, 8, 8);
+ sad[1] = sad_block(a + 8, a_stride, b + 8, 16, 8, 8);
+ a += 8*a_stride;
+ b += 8*16;
+ sad[2] = sad_block(a, a_stride, b, 16, 8, 8);
+ sad[3] = sad_block(a + 8, a_stride, b + 8, 16, 8, 8);
+ return sad[0] + sad[1] + sad[2] + sad[3];
+}
+
+static int h264e_sad_mb_unlaign_wh(const pix_t *a, int a_stride, const pix_t *b, point_t wh)
+{
+ return sad_block(a, a_stride, b, 16, wh.s.x, wh.s.y);
+}
+
+static void h264e_copy_8x8(pix_t *d, int d_stride, const pix_t *s)
+{
+ int cloop = 8;
+ assert(IS_ALIGNED(d, 8));
+ assert(IS_ALIGNED(s, 8));
+ do
+ {
+ int a = ((const int*)s)[0];
+ int b = ((const int*)s)[1];
+ ((int*)d)[0] = a;
+ ((int*)d)[1] = b;
+ s += 16;
+ d += d_stride;
+ } while(--cloop);
+}
+
+static void h264e_copy_16x16(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+ int cloop = 16;
+ assert(IS_ALIGNED(d, 8));
+ assert(IS_ALIGNED(s, 8));
+ do
+ {
+ int a = ((const int*)s)[0];
+ int b = ((const int*)s)[1];
+ int x = ((const int*)s)[2];
+ int y = ((const int*)s)[3];
+ ((int*)d)[0] = a;
+ ((int*)d)[1] = b;
+ ((int*)d)[2] = x;
+ ((int*)d)[3] = y;
+ s += s_stride;
+ d += d_stride;
+ } while(--cloop);
+}
+#endif /* H264E_ENABLE_PLAIN_C */
+
+#if H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM))
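+// Pads the plane by `guard` pixels on every side, replicating the nearest edge
+// pixel, so that motion search/compensation may safely read just outside the
+// picture.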
+static void h264e_copy_borders(unsigned char *pic, int w, int h, int guard)
+{
+ int r, rowbytes = w + 2*guard;
+ unsigned char *d = pic - guard;
+ for (r = 0; r < h; r++, d += rowbytes)
+ {
+ memset(d, d[guard], guard);
+ memset(d + rowbytes - guard, d[rowbytes - guard - 1], guard);
+ }
+ d = pic - guard - guard*rowbytes;
+ for (r = 0; r < guard; r++)
+ {
+ memcpy(d, pic - guard, rowbytes);
+ memcpy(d + (guard + h)*rowbytes, pic - guard + (h - 1)*rowbytes, rowbytes);
+ d += rowbytes;
+ }
+}
+#endif /* H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM)) */
+
+#if H264E_ENABLE_PLAIN_C
+#undef TRANSPOSE_BLOCK
+#define TRANSPOSE_BLOCK 1
+#define UNZIGSAG_IN_QUANT 0
+#define SUM_DIF(a, b) { int t = a + b; b = a - b; a = t; }
+
+static int clip_byte(int x)
+{
+ if (x > 255)
+ {
+ x = 255;
+ } else if (x < 0)
+ {
+ x = 0;
+ }
+ return x;
+}
+
+static void hadamar4_2d(int16_t *x)
+{
+ int s = 1;
+ int sback = 1;
+ int16_t tmp[16];
+ int16_t *out = tmp;
+ int16_t *p = x;
+ do
+ {
+ int cloop = 4;
+ do
+ {
+ int a, b, c, d;
+ a = *p; p += 4;//s;
+ b = *p; p += 4;//s;
+ c = *p; p += 4;//s;
+ d = *p; p -= 11;//sback;
+ SUM_DIF(a, c);
+ SUM_DIF(b, d);
+ SUM_DIF(a, b);
+ SUM_DIF(c, d);
+
+ *out = (int16_t)a; out += s;
+ *out = (int16_t)c; out += s;
+ *out = (int16_t)d; out += s;
+ *out = (int16_t)b; out += sback;
+ } while (--cloop);
+ s = 5 - s;
+ sback = -11;
+ out = x;
+ p = tmp;
+ } while (s != 1);
+}
+
+static void dequant_dc(quant_t *q, int16_t *qval, int dequant, int n)
+{
+ do q++->dq[0] = (int16_t)(*qval++ * (int16_t)dequant); while (--n);
+}
+
+static void quant_dc(int16_t *qval, int16_t *deq, int16_t quant, int n, int round_q18)
+{
+#if UNZIGSAG_IN_QUANT
+ int r_minus = (1 << 18) - round_q18;
+ static const uint8_t iscan16[16] = {0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15};
+ static const uint8_t iscan4[4] = {0, 1, 2, 3};
+ const uint8_t *scan = n == 4 ? iscan4 : iscan16;
+ do
+ {
+ int v = *qval;
+ int r = v < 0 ? r_minus : round_q18;
+ deq[*scan++] = *qval++ = (v * quant + r) >> 18;
+ } while (--n);
+#else
+ int r_minus = (1<<18) - round_q18;
+ do
+ {
+ int v = *qval;
+ int r = v < 0 ? r_minus : round_q18;
+ *deq++ = *qval++ = (v * quant + r) >> 18;
+ } while (--n);
+#endif
+}
+
+static void hadamar2_2d(int16_t *x)
+{
+ int a = x[0];
+ int b = x[1];
+ int c = x[2];
+ int d = x[3];
+ x[0] = (int16_t)(a + b + c + d);
+ x[1] = (int16_t)(a - b + c - d);
+ x[2] = (int16_t)(a + b - c - d);
+ x[3] = (int16_t)(a - b - c + d);
+}
+
+static void h264e_quant_luma_dc(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+ int16_t *tmp = ((int16_t*)q) - 16;
+ hadamar4_2d(tmp);
+ quant_dc(tmp, deq, qdat[0], 16, 0x20000);//0x15555);
+ hadamar4_2d(tmp);
+ assert(!(qdat[1] & 3));
+    // dirty trick here: shift w/o rounding, since it has no effect for qp >= 10 (or, to be precise, for qp >= 9)
+ dequant_dc(q, tmp, qdat[1] >> 2, 16);
+}
+
+static int h264e_quant_chroma_dc(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+ int16_t *tmp = ((int16_t*)q) - 16;
+ hadamar2_2d(tmp);
+ quant_dc(tmp, deq, (int16_t)(qdat[0] << 1), 4, 0xAAAA);
+ hadamar2_2d(tmp);
+ assert(!(qdat[1] & 1));
+ dequant_dc(q, tmp, qdat[1] >> 1, 4);
+ return !!(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
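+// Offset of the (quant, dequant) scale pair in qdat for each coefficient
+// position: even-row/even-column positions use qdat[0..1], odd/odd positions
+// qdat[4..5] and mixed positions qdat[2..3], matching the three scaling groups
+// of the 4x4 integer transform.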
+static const uint8_t g_idx2quant[16] =
+{
+ 0, 2, 0, 2,
+ 2, 4, 2, 4,
+ 0, 2, 0, 2,
+ 2, 4, 2, 4
+};
+
+#define TRANSFORM(x0, x1, x2, x3, p, s) { \
+ int t0 = x0 + x3; \
+ int t1 = x0 - x3; \
+ int t2 = x1 + x2; \
+ int t3 = x1 - x2; \
+ (p)[ 0] = (int16_t)(t0 + t2); \
+ (p)[ s] = (int16_t)(t1*2 + t3); \
+ (p)[2*s] = (int16_t)(t0 - t2); \
+ (p)[3*s] = (int16_t)(t1 - t3*2); \
+}
+
+static void FwdTransformResidual4x42(const uint8_t *inp, const uint8_t *pred,
+ uint32_t inp_stride, int16_t *out)
+{
+ int i;
+ int16_t tmp[16];
+
+#if TRANSPOSE_BLOCK
+ // Transform columns
+ for (i = 0; i < 4; i++, pred++, inp++)
+ {
+ int f0 = inp[0] - pred[0];
+ int f1 = inp[1*inp_stride] - pred[1*16];
+ int f2 = inp[2*inp_stride] - pred[2*16];
+ int f3 = inp[3*inp_stride] - pred[3*16];
+ TRANSFORM(f0, f1, f2, f3, tmp + i*4, 1);
+ }
+ // Transform rows
+ for (i = 0; i < 4; i++)
+ {
+ int d0 = tmp[i + 0];
+ int d1 = tmp[i + 4];
+ int d2 = tmp[i + 8];
+ int d3 = tmp[i + 12];
+ TRANSFORM(d0, d1, d2, d3, out + i, 4);
+ }
+
+#else
+ /* Transform rows */
+ for (i = 0; i < 16; i += 4)
+ {
+ int d0 = inp[0] - pred[0];
+ int d1 = inp[1] - pred[1];
+ int d2 = inp[2] - pred[2];
+ int d3 = inp[3] - pred[3];
+ TRANSFORM(d0, d1, d2, d3, tmp + i, 1);
+ pred += 16;
+ inp += inp_stride;
+ }
+
+ /* Transform columns */
+ for (i = 0; i < 4; i++)
+ {
+ int f0 = tmp[i + 0];
+ int f1 = tmp[i + 4];
+ int f2 = tmp[i + 8];
+ int f3 = tmp[i + 12];
+ TRANSFORM(f0, f1, f2, f3, out + i, 4);
+ }
+#endif
+}
+
+static void TransformResidual4x4(int16_t *pSrc)
+{
+ int i;
+ int16_t tmp[16];
+
+ /* Transform rows */
+ for (i = 0; i < 16; i += 4)
+ {
+#if TRANSPOSE_BLOCK
+ int d0 = pSrc[(i >> 2) + 0];
+ int d1 = pSrc[(i >> 2) + 4];
+ int d2 = pSrc[(i >> 2) + 8];
+ int d3 = pSrc[(i >> 2) + 12];
+#else
+ int d0 = pSrc[i + 0];
+ int d1 = pSrc[i + 1];
+ int d2 = pSrc[i + 2];
+ int d3 = pSrc[i + 3];
+#endif
+ int e0 = d0 + d2;
+ int e1 = d0 - d2;
+ int e2 = (d1 >> 1) - d3;
+ int e3 = d1 + (d3 >> 1);
+ int f0 = e0 + e3;
+ int f1 = e1 + e2;
+ int f2 = e1 - e2;
+ int f3 = e0 - e3;
+ tmp[i + 0] = (int16_t)f0;
+ tmp[i + 1] = (int16_t)f1;
+ tmp[i + 2] = (int16_t)f2;
+ tmp[i + 3] = (int16_t)f3;
+ }
+
+ /* Transform columns */
+ for (i = 0; i < 4; i++)
+ {
+ int f0 = tmp[i + 0];
+ int f1 = tmp[i + 4];
+ int f2 = tmp[i + 8];
+ int f3 = tmp[i + 12];
+ int g0 = f0 + f2;
+ int g1 = f0 - f2;
+ int g2 = (f1 >> 1) - f3;
+ int g3 = f1 + (f3 >> 1);
+ int h0 = g0 + g3;
+ int h1 = g1 + g2;
+ int h2 = g1 - g2;
+ int h3 = g0 - g3;
+ pSrc[i + 0] = (int16_t)((h0 + 32) >> 6);
+ pSrc[i + 4] = (int16_t)((h1 + 32) >> 6);
+ pSrc[i + 8] = (int16_t)((h2 + 32) >> 6);
+ pSrc[i + 12] = (int16_t)((h3 + 32) >> 6);
+ }
+}
+
+static int is_zero(const int16_t *dat, int i0, const uint16_t *thr)
+{
+ int i;
+ for (i = i0; i < 16; i++)
+ {
+ if ((unsigned)(dat[i] + thr[i & 7]) > (unsigned)2*thr[i & 7])
+ {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int is_zero4(const quant_t *q, int i0, const uint16_t *thr)
+{
+ return is_zero(q[0].dq, i0, thr) &&
+ is_zero(q[1].dq, i0, thr) &&
+ is_zero(q[4].dq, i0, thr) &&
+ is_zero(q[5].dq, i0, thr);
+}
+
+static int zero_smallq(quant_t *q, int mode, const uint16_t *qdat)
+{
+ int zmask = 0;
+ int i, i0 = mode & 1, n = mode >> 1;
+ if (mode == QDQ_MODE_INTER || mode == QDQ_MODE_CHROMA)
+ {
+ for (i = 0; i < n*n; i++)
+ {
+ if (is_zero(q[i].dq, i0, qdat + OFFS_THR_1_OFF))
+ {
+ zmask |= (1 << i); //9.19
+ }
+ }
+ if (mode == QDQ_MODE_INTER) //8.27
+ {
+ if ((~zmask & 0x0033) && is_zero4(q + 0, i0, qdat + OFFS_THR_2_OFF)) zmask |= 0x33;
+ if ((~zmask & 0x00CC) && is_zero4(q + 2, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 2);
+ if ((~zmask & 0x3300) && is_zero4(q + 8, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 8);
+ if ((~zmask & 0xCC00) && is_zero4(q + 10, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 10);
+ }
+ }
+ return zmask;
+}
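+// In inter mode the 16 4x4 blocks are indexed in raster order, so the 0x33
+// masks (bits 0,1,4,5) select one 8x8 quadrant -- q[0], q[1], q[4], q[5] --
+// which is exactly the group tested by is_zero4() above.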
+
+static int quantize(quant_t *q, int mode, const uint16_t *qdat, int zmask)
+{
+#if UNZIGSAG_IN_QUANT
+#if TRANSPOSE_BLOCK
+ // ; Zig-zag scan Transposed zig-zag
+ // ; 0 1 5 6 0 2 3 9
+ // ; 2 4 7 C 1 4 8 A
+ // ; 3 8 B D 5 7 B E
+ // ; 9 A E F 6 C D F
+ static const unsigned char iscan16[16] = { 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15 };
+#else
+ static const unsigned char iscan16[16] = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
+#endif
+#endif
+ int i, i0 = mode & 1, ccol, crow;
+ int nz_block_mask = 0;
+ ccol = mode >> 1;
+ crow = ccol;
+ do
+ {
+ do
+ {
+ int nz_mask = 0;
+
+ if (zmask & 1)
+ {
+ int32_t *p = (int32_t *)q->qv;
+ *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+ *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+ USED(p);
+ } else
+ {
+ for (i = i0; i < 16; i++)
+ {
+ int off = g_idx2quant[i];
+ int v, round = qdat[OFFS_RND_INTER];
+
+ if (q->dq[i] < 0) round = 0xFFFF - round;
+
+ v = (q->dq[i]*qdat[off] + round) >> 16;
+#if UNZIGSAG_IN_QUANT
+ if (v)
+ nz_mask |= 1 << iscan16[i];
+ q->qv[iscan16[i]] = (int16_t)v;
+#else
+ if (v)
+ nz_mask |= 1 << i;
+ q->qv[i] = (int16_t)v;
+#endif
+ q->dq[i] = (int16_t)(v*qdat[off + 1]);
+ }
+ }
+
+ zmask >>= 1;
+ nz_block_mask <<= 1;
+ if (nz_mask)
+ nz_block_mask |= 1;
+ q++;
+ } while (--ccol);
+ ccol = mode >> 1;
+ } while (--crow);
+ return nz_block_mask;
+}
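+// Quantization above works in Q16: v = (dq[i]*scale + round) >> 16 with the
+// scale taken from qdat[g_idx2quant[i]], and the dequantized coefficient is
+// immediately rebuilt in place as dq[i] = v*qdat[off + 1], so no separate
+// dequant pass is needed before the inverse transform.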
+
+static void transform(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q)
+{
+ int crow = mode >> 1;
+ int ccol = crow;
+
+ do
+ {
+ do
+ {
+ FwdTransformResidual4x42(inp, pred, inp_stride, q->dq);
+ q++;
+ inp += 4;
+ pred += 4;
+ } while (--ccol);
+ ccol = mode >> 1;
+ inp += 4*(inp_stride - ccol);
+ pred += 4*(16 - ccol);
+ } while (--crow);
+}
+
+static int h264e_transform_sub_quant_dequant(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat)
+{
+ int zmask;
+ transform(inp, pred, inp_stride, mode, q);
+ if (mode & 1) // QDQ_MODE_INTRA_16 || QDQ_MODE_CHROMA
+ {
+ int cloop = (mode >> 1)*(mode >> 1);
+ short *dc = ((short *)q) - 16;
+ quant_t *pq = q;
+ do
+ {
+ *dc++ = pq->dq[0];
+ pq++;
+ } while (--cloop);
+ }
+ zmask = zero_smallq(q, mode, qdat);
+ return quantize(q, mode, qdat, zmask);
+}
+
+static void h264e_transform_add(pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask)
+{
+ int crow = side;
+ int ccol = crow;
+
+ assert(IS_ALIGNED(out, 4));
+ assert(IS_ALIGNED(pred, 4));
+ assert(!(out_stride % 4));
+
+ do
+ {
+ do
+ {
+ if (mask >= 0)
+ {
+ // copy 4x4
+ pix_t *dst = out;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 0 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 1 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 2 * 16); dst += out_stride;
+ *(uint32_t*)dst = *(uint32_t*)(pred + 3 * 16);
+ } else
+ {
+ int i, j;
+ TransformResidual4x4(q->dq);
+ for (j = 0; j < 4; j++)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ int Value = q->dq[j * 4 + i] + pred[j * 16 + i];
+ out[j * out_stride + i] = (pix_t)clip_byte(Value);
+ }
+ }
+ }
+ mask = (uint32_t)mask << 1;
+ q++;
+ out += 4;
+ pred += 4;
+ } while (--ccol);
+ ccol = side;
+ out += 4*(out_stride - ccol);
+ pred += 4*(16 - ccol);
+ } while (--crow);
+}
+#endif /* H264E_ENABLE_PLAIN_C */
+
+#if H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM))
+
+#define BS_BITS 32
+
+static void h264e_bs_put_bits(bs_t *bs, unsigned n, unsigned val)
+{
+ assert(!(val >> n));
+ bs->shift -= n;
+ assert((unsigned)n <= 32);
+ if (bs->shift < 0)
+ {
+ assert(-bs->shift < 32);
+ bs->cache |= val >> -bs->shift;
+ *bs->buf++ = SWAP32(bs->cache);
+ bs->shift = 32 + bs->shift;
+ bs->cache = 0;
+ }
+ bs->cache |= val << bs->shift;
+}
+
+static void h264e_bs_flush(bs_t *bs)
+{
+ *bs->buf = SWAP32(bs->cache);
+}
+
+static unsigned h264e_bs_get_pos_bits(const bs_t *bs)
+{
+ unsigned pos_bits = (unsigned)((bs->buf - bs->origin)*BS_BITS);
+ pos_bits += BS_BITS - bs->shift;
+ assert((int)pos_bits >= 0);
+ return pos_bits;
+}
+
+static unsigned h264e_bs_byte_align(bs_t *bs)
+{
+ int pos = h264e_bs_get_pos_bits(bs);
+ h264e_bs_put_bits(bs, -pos & 7, 0);
+ return pos + (-pos & 7);
+}
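+// Example: with 13 bits already written, -13 & 7 == 3, so three zero bits are
+// appended and the function returns 16, the new byte-aligned position.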
+
+/**
+* Golomb code
+* 0 => 1
+* 1 => 01 0
+* 2 => 01 1
+* 3 => 001 00
+* 4 => 001 01
+*
+* [0] => 1
+* [1..2] => 01x
+* [3..6] => 001xx
+* [7..14] => 0001xxx
+*
+*/
+static void h264e_bs_put_golomb(bs_t *bs, unsigned val)
+{
+#ifdef __arm__
+ int size = 32 - __clz(val + 1);
+#else
+ int size = 0;
+ unsigned t = val + 1;
+ do
+ {
+ size++;
+ } while (t >>= 1);
+#endif
+ h264e_bs_put_bits(bs, 2*size - 1, val + 1);
+}
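+// Example: val = 4 -> val + 1 = 5 = 0b101, size = 3, so put_bits(2*3 - 1, 5)
+// writes the 5-bit pattern "00101".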
+
+/**
+* signed Golomb code.
+* mapping to unsigned code:
+* 0 => 0
+* 1 => 1
+* -1 => 2
+* 2 => 3
+* -2 => 4
+* 3 => 5
+* -3 => 6
+*/
+static void h264e_bs_put_sgolomb(bs_t *bs, int val)
+{
+ val = 2*val - 1;
+ val ^= val >> 31;
+ h264e_bs_put_golomb(bs, val);
+}
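+// Example: val = -2 -> 2*(-2) - 1 = -5, and -5 ^ (-5 >> 31) = 4, so se(-2) is
+// written as ue(4) = "00101", matching the mapping above.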
+
+static void h264e_bs_init_bits(bs_t *bs, void *data)
+{
+ bs->origin = data;
+ bs->buf = bs->origin;
+ bs->shift = BS_BITS;
+ bs->cache = 0;
+}
+
+static void h264e_vlc_encode(bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx)
+{
+ int nnz_context, nlevels, nnz; // nnz = nlevels + trailing_ones
+ int trailing_ones = 0;
+ int trailing_ones_sign = 0;
+ uint8_t runs[16];
+ uint8_t *prun = runs;
+ int16_t *levels;
+ int cloop = maxNumCoeff; USED(cloop);
+ BS_OPEN(bs)
+
+#if H264E_ENABLE_SSE2 || (H264E_ENABLE_PLAIN_C && !H264E_ENABLE_NEON)
+ // this branch is used with the SSE + C configuration
+ int16_t zzquant[16];
+ levels = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+ if (maxNumCoeff != 4)
+ {
+ int v;
+ if (maxNumCoeff == 16)
+ {
+ v = quant[15]*2; if (v) *--levels = (int16_t)v, *prun++ = 16;
+ v = quant[11]*2; if (v) *--levels = (int16_t)v, *prun++ = 15;
+ v = quant[14]*2; if (v) *--levels = (int16_t)v, *prun++ = 14;
+ v = quant[13]*2; if (v) *--levels = (int16_t)v, *prun++ = 13;
+ v = quant[10]*2; if (v) *--levels = (int16_t)v, *prun++ = 12;
+ v = quant[ 7]*2; if (v) *--levels = (int16_t)v, *prun++ = 11;
+ v = quant[ 3]*2; if (v) *--levels = (int16_t)v, *prun++ = 10;
+ v = quant[ 6]*2; if (v) *--levels = (int16_t)v, *prun++ = 9;
+ v = quant[ 9]*2; if (v) *--levels = (int16_t)v, *prun++ = 8;
+ v = quant[12]*2; if (v) *--levels = (int16_t)v, *prun++ = 7;
+ v = quant[ 8]*2; if (v) *--levels = (int16_t)v, *prun++ = 6;
+ v = quant[ 5]*2; if (v) *--levels = (int16_t)v, *prun++ = 5;
+ v = quant[ 2]*2; if (v) *--levels = (int16_t)v, *prun++ = 4;
+ v = quant[ 1]*2; if (v) *--levels = (int16_t)v, *prun++ = 3;
+ v = quant[ 4]*2; if (v) *--levels = (int16_t)v, *prun++ = 2;
+ v = quant[ 0]*2; if (v) *--levels = (int16_t)v, *prun++ = 1;
+ } else
+ {
+ v = quant[15]*2; if (v) *--levels = (int16_t)v, *prun++ = 15;
+ v = quant[11]*2; if (v) *--levels = (int16_t)v, *prun++ = 14;
+ v = quant[14]*2; if (v) *--levels = (int16_t)v, *prun++ = 13;
+ v = quant[13]*2; if (v) *--levels = (int16_t)v, *prun++ = 12;
+ v = quant[10]*2; if (v) *--levels = (int16_t)v, *prun++ = 11;
+ v = quant[ 7]*2; if (v) *--levels = (int16_t)v, *prun++ = 10;
+ v = quant[ 3]*2; if (v) *--levels = (int16_t)v, *prun++ = 9;
+ v = quant[ 6]*2; if (v) *--levels = (int16_t)v, *prun++ = 8;
+ v = quant[ 9]*2; if (v) *--levels = (int16_t)v, *prun++ = 7;
+ v = quant[12]*2; if (v) *--levels = (int16_t)v, *prun++ = 6;
+ v = quant[ 8]*2; if (v) *--levels = (int16_t)v, *prun++ = 5;
+ v = quant[ 5]*2; if (v) *--levels = (int16_t)v, *prun++ = 4;
+ v = quant[ 2]*2; if (v) *--levels = (int16_t)v, *prun++ = 3;
+ v = quant[ 1]*2; if (v) *--levels = (int16_t)v, *prun++ = 2;
+ v = quant[ 4]*2; if (v) *--levels = (int16_t)v, *prun++ = 1;
+ }
+ } else
+ {
+ int v;
+ v = quant[ 3]*2; if (v) *--levels = (int16_t)v, *prun++ = 4;
+ v = quant[ 2]*2; if (v) *--levels = (int16_t)v, *prun++ = 3;
+ v = quant[ 1]*2; if (v) *--levels = (int16_t)v, *prun++ = 2;
+ v = quant[ 0]*2; if (v) *--levels = (int16_t)v, *prun++ = 1;
+ }
+ USED(prun);
+ quant = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+ nnz = (int)(quant - levels);
+#else
+ quant += (maxNumCoeff == 4) ? 4 : 16;
+ levels = quant;
+ do
+ {
+ int v = *--quant;
+ if (v)
+ {
+ *--levels = v*2;
+ *prun++ = cloop;
+ }
+ } while (--cloop);
+ quant += maxNumCoeff;
+ nnz = quant - levels;
+#endif
+
+ if (nnz)
+ {
+ cloop = MIN(3, nnz);
+ levels = quant - 1;
+ do
+ {
+ if ((unsigned)(*levels + 2) > 4u)
+ {
+ break;
+ }
+ trailing_ones_sign = (trailing_ones_sign << 1) | (*levels-- < 0);
+ trailing_ones++;
+ } while (--cloop);
+ }
+ nlevels = nnz - trailing_ones;
+
+ nnz_context = nz_ctx[-1] + nz_ctx[1];
+
+ nz_ctx[0] = (uint8_t)nnz;
+ if (nnz_context <= 34)
+ {
+ nnz_context = (nnz_context + 1) >> 1;
+ }
+ nnz_context &= 31;
+
+ // 9.2.1 Parsing process for total number of transform coefficient levels and trailing ones
+ {
+ int off = h264e_g_coeff_token[nnz_context];
+ int n = 6, val = h264e_g_coeff_token[off + trailing_ones + 4*nlevels];
+ if (off != 230)
+ {
+ n = (val & 15) + 1;
+ val >>= 4;
+ }
+ BS_PUT(n, val);
+ }
+
+ if (nnz)
+ {
+ if (trailing_ones)
+ {
+ BS_PUT(trailing_ones, trailing_ones_sign);
+ }
+ if (nlevels)
+ {
+ int vlcnum = 1;
+ int sym_len, prefix_len;
+
+ int sym = *levels-- - 2;
+ if (sym < 0) sym = -3 - sym;
+ if (sym >= 6) vlcnum++;
+ if (trailing_ones < 3)
+ {
+ sym -= 2;
+ if (nnz > 10)
+ {
+ sym_len = 1;
+ prefix_len = sym >> 1;
+ if (prefix_len >= 15)
+ {
+ // or vlcnum = 1; goto escape;
+ prefix_len = 15;
+ sym_len = 12;
+ }
+ sym -= prefix_len << 1;
+ // bypass vlcnum advance due to sym -= 2; above
+ goto loop_enter;
+ }
+ }
+
+ if (sym < 14)
+ {
+ prefix_len = sym;
+ sym = 0; // to avoid side effect in bitbuf
+ sym_len = 0;
+ } else if (sym < 30)
+ {
+ prefix_len = 14;
+ sym_len = 4;
+ sym -= 14;
+ } else
+ {
+ vlcnum = 1;
+ goto escape;
+ }
+ goto loop_enter;
+
+ for (;;)
+ {
+ sym_len = vlcnum;
+ prefix_len = sym >> vlcnum;
+ if (prefix_len >= 15)
+ {
+escape:
+ prefix_len = 15;
+ sym_len = 12;
+ }
+ sym -= prefix_len << vlcnum;
+
+ if (prefix_len >= 3 && vlcnum < 6)
+ vlcnum++;
+loop_enter:
+ sym |= 1 << sym_len;
+ sym_len += prefix_len + 1;
+ BS_PUT(sym_len, sym);
+ if (!--nlevels) break;
+ sym = *levels-- - 2;
+ if (sym < 0) sym = -3 - sym;
+ }
+ }
+
+ if (nnz < maxNumCoeff)
+ {
+ const uint8_t *vlc = (maxNumCoeff == 4) ? h264e_g_total_zeros_cr_2x2 : h264e_g_total_zeros;
+ uint8_t *run = runs;
+ int run_prev = *run++;
+ int nzeros = run_prev - nnz;
+ int zeros_left = 2*nzeros - 1;
+ int ctx = nnz - 1;
+ run[nnz - 1] = (uint8_t)maxNumCoeff; // terminator
+ for (;;)
+ {
+ int t;
+
+ int val = vlc[vlc[ctx] + nzeros];
+ int n = val & 15;
+ val >>= 4;
+ BS_PUT(n, val);
+
+ zeros_left -= nzeros;
+ if (zeros_left < 0)
+ {
+ break;
+ }
+
+ t = *run++;
+ nzeros = run_prev - t - 1;
+ if (nzeros < 0)
+ {
+ break;
+ }
+ run_prev = t;
+ assert(zeros_left < 14);
+ vlc = h264e_g_run_before;
+ ctx = zeros_left;
+ }
+ }
+ }
+ BS_CLOSE(bs);
+}
+#endif /* H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM)) */
+
+#if H264E_SVC_API
+static uint32_t udiv32(uint32_t n, uint32_t d)
+{
+ uint32_t q = 0, r = n, N = 16;
+ do
+ {
+ N--;
+ if ((r >> N) >= d)
+ {
+ r -= (d << N);
+ q += (1 << N);
+ }
+ } while (N);
+ return q;
+}
+
+static void h264e_copy_8x8_s(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+ int cloop = 8;
+ assert(!((unsigned)(uintptr_t)d & 7));
+ assert(!((unsigned)(uintptr_t)s & 7));
+ do
+ {
+ int a = ((const int*)s)[0];
+ int b = ((const int*)s)[1];
+ ((int*)d)[0] = a;
+ ((int*)d)[1] = b;
+ s += s_stride;
+ d += d_stride;
+ } while(--cloop);
+}
+
+static void h264e_frame_downsampling(uint8_t *out, int wo, int ho,
+ const uint8_t *src, int wi, int hi, int wo_Crop, int ho_Crop, int wi_Crop, int hi_Crop)
+{
+#define Q_BILIN 12
+#define ONE_BILIN (1<<Q_BILIN)
+ int r, c;
+ int scaleh = udiv32(hi_Crop<<Q_BILIN, ho_Crop);
+ int scalew = udiv32(wi_Crop<<Q_BILIN, wo_Crop);
+
+ for (r = 0; r < ho_Crop; r++)
+ {
+ int dy = r*scaleh + (scaleh >> 2);
+ int y = dy >> Q_BILIN;
+ dy = dy & (ONE_BILIN - 1);
+
+ for (c = 0; c < wo_Crop; c++)
+ {
+ int dx = c*scalew + (scalew >> 2);
+ // int dx = c*scalew;
+ int x = dx >> Q_BILIN;
+ const uint8_t *s0, *s1;
+ uint8_t s00, s01, s10, s11;
+ dx &= (ONE_BILIN - 1);
+
+
+ s1 = s0 = src + x + y*wi;
+ if (y < hi - 1)
+ {
+ s1 = s0 + wi;
+ }
+
+ s00 = s01 = s0[0];
+ s10 = s11 = s1[0];
+ if (x < wi - 1)
+ {
+ s01 = s0[1];
+ s11 = s1[1];
+ }
+
+ *out++ =(uint8_t) ((((s11*dx + s10*(ONE_BILIN - dx)) >> (Q_BILIN - 1))*dy +
+ ((s01*dx + s00*(ONE_BILIN - dx)) >> (Q_BILIN - 1))*(ONE_BILIN - dy) + (1 << (Q_BILIN + 1 - 1))) >> (Q_BILIN + 1));
+ }
+ if (wo > wo_Crop) //copy border
+ {
+ int cloop = wo - wo_Crop;
+ uint8_t border = out[-1];
+ do
+ {
+ *out++ = border;
+ } while(--cloop);
+ }
+ }
+
+ // copy bottom
+ {
+ int cloop = (ho - ho_Crop) * wo;
+ if (cloop > 0)
+ {
+ do
+ {
+ *out = out[-wo];
+ out++;
+ } while(--cloop);
+ }
+ }
+}
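+// The resampling above is plain bilinear interpolation in Q12 ("Q_BILIN")
+// fixed point: dx/dy hold the fractional source position, each horizontal
+// blend keeps one extra bit of precision (>> (Q_BILIN - 1)), and the final
+// addition of (1 << (Q_BILIN + 1 - 1)) followed by >> (Q_BILIN + 1) rounds
+// the result back to 8-bit pixels.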
+
+static int clip(int val, int max)
+{
+ if (val < 0) return 0;
+ if (val > max) return max;
+ return val;
+}
+
+static const int8_t g_filter16_luma[16][4] =
+{
+ { 0, 32, 0, 0 },
+ { -1, 32, 2, -1 },
+ { -2, 31, 4, -1 },
+ { -3, 30, 6, -1 },
+ { -3, 28, 8, -1 },
+ { -4, 26, 11, -1 },
+ { -4, 24, 14, -2 },
+ { -3, 22, 16, -3 },
+ { -3, 19, 19, -3 },
+ { -3, 16, 22, -3 },
+ { -2, 14, 24, -4 },
+ { -1, 11, 26, -4 },
+ { -1, 8, 28, -3 },
+ { -1, 6, 30, -3 },
+ { -1, 4, 31, -2 },
+ { -1, 2, 32, -1 }
+};
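+// Every row of the 16-phase luma filter sums to 32 (5-bit normalization): the
+// horizontal pass below therefore produces values scaled by 32, and the
+// combined two-pass result is renormalized by the "+ 512 ... >> 10" step
+// (32*32 = 1024) in the vertical pass.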
+
+static void h264e_intra_upsampling(int srcw, int srch, int dstw, int dsth, int is_chroma,
+ const uint8_t *arg_src, int src_stride, uint8_t *arg_dst, int dst_stride)
+{
+ int i, j;
+ //===== set position calculation parameters =====
+ int shift_x = 16;//(m_iLevelIdc <= 30 ? 16 : 31 - CeilLog2(iBaseW));
+ int shift_y = 16;//(m_iLevelIdc <= 30 ? 16 : 31 - CeilLog2(iBaseH));
+ int step_x = udiv32(((unsigned int)srcw << shift_x) + (dstw >> 1), dstw);
+ int step_y = udiv32(((unsigned int)srch << shift_y) + (dsth >> 1), dsth);
+ int start_x = udiv32((srcw << (shift_x - 1 - is_chroma)) + (dstw >> 1), dstw) + (1 << (shift_x - 5));
+ int start_y = udiv32((srch << (shift_y - 1 - is_chroma)) + (dsth >> 1), dsth) + (1 << (shift_y - 5));
+ int16_t *temp16 = (short*)(arg_dst + dst_stride*dsth) + 4; // malloc(( iBaseH )*sizeof(short)); // the ref frame has a border of 1 MB
+
+ if (is_chroma)
+ {
+ int xpos = start_x - (4 << 12);
+ for (i = 0; i < dstw; i++, xpos += step_x)
+ {
+ const uint8_t* src = arg_src;
+ int xfrac = (xpos >> 12) & 15;
+ int xint = xpos >> 16;
+ int m0 = clip(xint + 0, srcw - 1);
+ int m1 = clip(xint + 1, srcw - 1);
+ for( j = 0; j < srch ; j++ )
+ {
+ temp16[j] = (int16_t)(src[m1]*xfrac + src[m0]*(16 - xfrac));
+ src += src_stride;
+ }
+ temp16[-1] = temp16[0];
+ temp16[srch] = temp16[srch-1];
+
+ //========== vertical upsampling ===========
+ {
+ int16_t* src16 = temp16;
+ uint8_t* dst = arg_dst + i;
+ int ypos = start_y - (4 << 12);
+ for (j = 0; j < dsth; j++)
+ {
+ int yfrac = (ypos >> 12) & 15;
+ int yint = (ypos >> 16);
+ int acc = yfrac*src16[yint + 1] + (16 - yfrac)*src16[yint + 0];
+ acc = (acc + 128) >> 8;
+ *dst = (int8_t)acc;
+ dst += dst_stride;
+ ypos += step_y;
+ }
+ }
+ }
+ } else
+ {
+ int xpos = start_x - (8 << 12);
+ for (i = 0; i < dstw; i++, xpos += step_x)
+ {
+ const uint8_t *src = arg_src;
+ int xfrac = (xpos >> 12) & 15;
+ int xint = xpos >> 16;
+ int m0 = clip(xint - 1, srcw - 1);
+ int m1 = clip(xint , srcw - 1);
+ int m2 = clip(xint + 1, srcw - 1);
+ int m3 = clip(xint + 2, srcw - 1);
+ //========== horizontal upsampling ===========
+ for( j = 0; j < srch ; j++ )
+ {
+ int acc = 0;
+ acc += g_filter16_luma[xfrac][0] * src[m0];
+ acc += g_filter16_luma[xfrac][1] * src[m1];
+ acc += g_filter16_luma[xfrac][2] * src[m2];
+ acc += g_filter16_luma[xfrac][3] * src[m3];
+ temp16[j] = (int16_t)acc;
+ src += src_stride;
+ }
+ temp16[-2] = temp16[-1] = temp16[0];
+ temp16[srch + 1] = temp16[srch] = temp16[srch - 1];
+
+ //========== vertical upsampling ===========
+ {
+ int16_t *src16 = temp16;
+ uint8_t *dst = arg_dst + i;
+ int ypos = start_y - (8 << 12);
+
+ for (j = 0; j < dsth; j++)
+ {
+ int yfrac = (ypos >> 12) & 15;
+ int yint = ypos >> 16;
+ int acc = 512;
+ acc += g_filter16_luma[yfrac][0] * src16[yint + 0 - 1];
+ acc += g_filter16_luma[yfrac][1] * src16[yint + 1 - 1];
+ acc += g_filter16_luma[yfrac][2] * src16[yint + 2 - 1];
+ acc += g_filter16_luma[yfrac][3] * src16[yint + 3 - 1];
+ acc >>= 10;
+ if (acc < 0)
+ {
+ acc = 0;
+ }
+ if (acc > 255)
+ {
+ acc = 255;
+ }
+ *dst = (int8_t)acc;
+ dst += dst_stride;
+ ypos += step_y;
+ }
+ }
+ }
+ }
+}
+#endif /* H264E_SVC_API */
+
+// Experimental code branch:
+// Rate control takes into account that long-term references compress worse than short-term ones
+#define H264E_RATE_CONTROL_GOLDEN_FRAMES 1
+
+/************************************************************************/
+/* Constants (can't be changed) */
+/************************************************************************/
+
+#define MIN_QP 10 // Minimum QP
+
+#define MVPRED_MEDIAN 1
+#define MVPRED_L 2
+#define MVPRED_U 3
+#define MVPRED_UR 4
+#define MV_NA 0x8000
+#define AVAIL(mv) ((mv).u32 != MV_NA)
+
+#define SLICE_TYPE_P 0
+#define SLICE_TYPE_I 2
+
+#define NNZ_NA 64
+
+#define MAX_MV_CAND 20
+
+#define STARTCODE_4BYTES 4
+
+#define SCALABLE_BASELINE 83
+
+/************************************************************************/
+/* Hardcoded params (can be changed at compile time) */
+/************************************************************************/
+#define ALPHA_OFS 0 // Deblock alpha offset
+#define BETA_OFS 0 // Deblock beta offset
+#define DQP_CHROMA 0 // chroma delta QP
+
+#define MV_RANGE 32 // Motion vector search range, pixels
+#define MV_GUARD 14 // Out-of-frame MV's restriction, pixels
+
+/************************************************************************/
+/* Code shortcuts */
+/************************************************************************/
+#define U(n,v) h264e_bs_put_bits(enc->bs, n, v)
+#define U1(v) h264e_bs_put_bits(enc->bs, 1, v)
+#define UE(v) h264e_bs_put_golomb(enc->bs, v)
+#define SE(v) h264e_bs_put_sgolomb(enc->bs, v)
+#define SWAP(datatype, a, b) { datatype _ = a; a = b; b = _; }
+#define SQR(x) ((x)*(x))
+#define SQRP(pnt) SQR(pnt.s.x) + SQR(pnt.s.y)
+#define SMOOTH(smth, p) smth.s.x = (63*smth.s.x + p.s.x + 32) >> 6; smth.s.y = (63*smth.s.y + p.s.y + 32) >> 6;
+#define MUL_LAMBDA(x, lambda) ((x)*(lambda) >> 4)
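+// SMOOTH() is a rounded exponential moving average: the new sample contributes
+// 1/64, the history 63/64. MUL_LAMBDA() multiplies by a Q4 fixed-point lambda
+// (the >> 4 drops the four fractional bits).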
+
+/************************************************************************/
+/* Optimized code fallback */
+/************************************************************************/
+
+#if defined(MINIH264_ASM)
+#include "asm/minih264e_asm.h"
+#endif
+#if H264E_ENABLE_NEON && defined(MINIH264_ASM)
+# define h264e_bs_put_bits_neon h264e_bs_put_bits_arm11
+# define h264e_bs_flush_neon h264e_bs_flush_arm11
+# define h264e_bs_get_pos_bits_neon h264e_bs_get_pos_bits_arm11
+# define h264e_bs_byte_align_neon h264e_bs_byte_align_arm11
+# define h264e_bs_put_golomb_neon h264e_bs_put_golomb_arm11
+# define h264e_bs_put_sgolomb_neon h264e_bs_put_sgolomb_arm11
+# define h264e_bs_init_bits_neon h264e_bs_init_bits_arm11
+# define h264e_vlc_encode_neon h264e_vlc_encode_arm11
+#elif H264E_ENABLE_NEON
+# define h264e_bs_put_bits_neon h264e_bs_put_bits
+# define h264e_bs_flush_neon h264e_bs_flush
+# define h264e_bs_get_pos_bits_neon h264e_bs_get_pos_bits
+# define h264e_bs_byte_align_neon h264e_bs_byte_align
+# define h264e_bs_put_golomb_neon h264e_bs_put_golomb
+# define h264e_bs_put_sgolomb_neon h264e_bs_put_sgolomb
+# define h264e_bs_init_bits_neon h264e_bs_init_bits
+# define h264e_vlc_encode_neon h264e_vlc_encode
+# define h264e_copy_borders_neon h264e_copy_borders
+#endif
+
+/************************************************************************/
+/* Declare exported functions for each configuration */
+/************************************************************************/
+#if !H264E_CONFIGS_COUNT
+# error no build configuration defined
+#elif H264E_CONFIGS_COUNT == 1
+// Exactly one configuration: append config suffix to exported names
+# if H264E_ENABLE_NEON
+# define MAP_NAME(name) name##_neon
+# endif
+# if H264E_ENABLE_SSE2
+# define MAP_NAME(name) name##_sse2
+# endif
+#else //if H264E_CONFIGS_COUNT > 1
+// Several configurations: use a Virtual Function Table (VFT)
+typedef struct
+{
+# define H264E_API(type, name, args) type (*name) args;
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int, h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void, h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void, h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void, h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void, h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void, h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void, h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int, h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int, h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int, h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int, h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+# undef H264E_API
+} vft_t;
+
+// pointer to the active VFT, selected at run time by init_vft()
+static const vft_t *g_vft;
+
+// const VFT for each supported build config
+#if H264E_ENABLE_PLAIN_C
+static const vft_t g_vft_plain_c =
+{
+#define H264E_API(type, name, args) name,
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int, h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void, h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void, h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void, h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void, h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void, h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void, h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int, h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int, h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int, h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int, h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#undef H264E_API
+};
+#endif
+#if H264E_ENABLE_NEON
+static const vft_t g_vft_neon =
+{
+#define H264E_API(type, name, args) name##_neon,
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int, h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void, h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void, h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void, h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void, h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void, h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void, h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int, h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int, h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int, h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int, h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#undef H264E_API
+};
+#endif
+#if H264E_ENABLE_SSE2
+static const vft_t g_vft_sse2 =
+{
+#define H264E_API(type, name, args) name##_sse2,
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int, h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void, h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void, h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void, h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void, h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void, h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void, h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int, h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int, h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int, h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int, h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#undef H264E_API
+};
+#endif
+
+/************************************************************************/
+/* Code to detect CPU features and init VFT */
+/************************************************************************/
+
+#if H264E_ENABLE_SSE2
+#if defined(_MSC_VER)
+#define minih264_cpuid __cpuid
+#else
+static __inline__ __attribute__((always_inline)) void minih264_cpuid(int CPUInfo[], const int InfoType)
+{
+#if defined(__PIC__)
+ __asm__ __volatile__(
+#if defined(__x86_64__)
+ "push %%rbx\n"
+ "cpuid\n"
+ "xchgl %%ebx, %1\n"
+ "pop %%rbx\n"
+#else /* defined(__x86_64__) */
+ "xchgl %%ebx, %1\n"
+ "cpuid\n"
+ "xchgl %%ebx, %1\n"
+#endif /* defined(__x86_64__) */
+ : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+ : "a" (InfoType));
+#else /* defined(__PIC__) */
+ __asm__ __volatile__(
+ "cpuid"
+ : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+ : "a" (InfoType));
+#endif /* defined(__PIC__)*/
+}
+#endif /* defined(_MSC_VER) */
+
+static int CPU_have_SSE2()
+{
+ int CPUInfo[4];
+ minih264_cpuid(CPUInfo, 0);
+ if (CPUInfo[0] > 0)
+ {
+ minih264_cpuid(CPUInfo, 1);
+ if (CPUInfo[3] & (1 << 26))
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+static void init_vft(int enableNEON)
+{
+#if H264E_ENABLE_PLAIN_C
+ g_vft = &g_vft_plain_c;
+#endif
+ (void)enableNEON;
+#if H264E_ENABLE_NEON
+ if (enableNEON)
+ g_vft = &g_vft_neon;
+ else
+ g_vft = &g_vft_plain_c;
+#endif
+#if H264E_ENABLE_SSE2
+ if (CPU_have_SSE2())
+ {
+ g_vft = &g_vft_sse2;
+ }
+#endif
+}
+
+#define MAP_NAME(name) g_vft->name
+
+#endif
+
+#ifdef MAP_NAME
+# define h264e_qpel_interpolate_chroma MAP_NAME(h264e_qpel_interpolate_chroma)
+# define h264e_qpel_interpolate_luma MAP_NAME(h264e_qpel_interpolate_luma)
+# define h264e_qpel_average_wh_align MAP_NAME(h264e_qpel_average_wh_align)
+# define h264e_deblock_chroma MAP_NAME(h264e_deblock_chroma)
+# define h264e_deblock_luma MAP_NAME(h264e_deblock_luma)
+# define h264e_intra_predict_chroma MAP_NAME(h264e_intra_predict_chroma)
+# define h264e_intra_predict_16x16 MAP_NAME(h264e_intra_predict_16x16)
+# define h264e_intra_choose_4x4 MAP_NAME(h264e_intra_choose_4x4)
+# define h264e_bs_put_bits MAP_NAME(h264e_bs_put_bits)
+# define h264e_bs_flush MAP_NAME(h264e_bs_flush)
+# define h264e_bs_get_pos_bits MAP_NAME(h264e_bs_get_pos_bits)
+# define h264e_bs_byte_align MAP_NAME(h264e_bs_byte_align)
+# define h264e_bs_put_golomb MAP_NAME(h264e_bs_put_golomb)
+# define h264e_bs_put_sgolomb MAP_NAME(h264e_bs_put_sgolomb)
+# define h264e_bs_init_bits MAP_NAME(h264e_bs_init_bits)
+# define h264e_vlc_encode MAP_NAME(h264e_vlc_encode)
+# define h264e_sad_mb_unlaign_8x8 MAP_NAME(h264e_sad_mb_unlaign_8x8)
+# define h264e_sad_mb_unlaign_wh MAP_NAME(h264e_sad_mb_unlaign_wh)
+# define h264e_copy_8x8 MAP_NAME(h264e_copy_8x8)
+# define h264e_copy_16x16 MAP_NAME(h264e_copy_16x16)
+# define h264e_copy_borders MAP_NAME(h264e_copy_borders)
+# define h264e_transform_add MAP_NAME(h264e_transform_add)
+# define h264e_transform_sub_quant_dequant MAP_NAME(h264e_transform_sub_quant_dequant)
+# define h264e_quant_luma_dc MAP_NAME(h264e_quant_luma_dc)
+# define h264e_quant_chroma_dc MAP_NAME(h264e_quant_chroma_dc)
+# define h264e_denoise_run MAP_NAME(h264e_denoise_run)
+#endif
+
+/************************************************************************/
+/* Arithmetics */
+/************************************************************************/
+
+#ifndef __arm__
+/**
+* Count of leading zeroes
+*/
+static unsigned __clz(unsigned v)
+{
+#if defined(_MSC_VER)
+ unsigned long nbit;
+ _BitScanReverse(&nbit, v);
+ return 31 - nbit;
+#elif defined(__GNUC__) || defined(__clang__) || defined(__aarch64__)
+ return __builtin_clz(v);
+#else
+ unsigned clz = 32;
+ assert(v);
+ do
+ {
+ clz--;
+ } while (v >>= 1);
+ return clz;
+#endif
+}
+#endif
+
+/**
+* Size of unsigned Golomb code
+*/
+static int bitsize_ue(int v)
+{
+ return 2*(32 - __clz(v + 1)) - 1;
+}
+
+/**
+* Size of signed Golomb code
+*/
+static int bits_se(int v)
+{
+ v = 2*v - 1;
+ v ^= v >> 31;
+ return bitsize_ue(v);
+}
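+// Example: bits_se(-2) maps -2 to the unsigned code 4 (signed values map as
+// 0, 1, -1, 2, -2, ... -> 0, 1, 2, 3, 4, ...), and bitsize_ue(4) =
+// 2*(32 - __clz(5)) - 1 = 2*3 - 1 = 5 bits.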
+
+/**
+* Multiply 32x32 Q16
+*/
+static uint32_t mul32x32shr16(uint32_t x, uint32_t y)
+{
+ uint32_t r = (x >> 16) * (y & 0xFFFFu) + x * (y >> 16) + ((y & 0xFFFFu) * (x & 0xFFFFu) >> 16);
+ //assert(r == (uint32_t)((__int64)x*y>>16));
+ return r;
+}
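+// Example with Q16 operands: x = 0x18000 (1.5) and y = 0x28000 (2.5) give
+// 1*0x8000 + 0x18000*2 + ((0x8000*0x8000) >> 16) = 0x3C000, i.e. 3.75 in Q16.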
+
+/**
+* Integer division, producing Q16 output
+*/
+static uint32_t div_q16(uint32_t numer, uint32_t denum)
+{
+ unsigned f = 1 << __clz(denum);
+ do
+ {
+ denum = denum * f >> 16;
+ numer = mul32x32shr16(numer, f);
+ f = ((1 << 17) - denum);
+ } while (denum != 0xffff);
+ return numer;
+}
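+// This is a Goldschmidt-style iteration: denum is scaled towards 0xFFFF (~1.0
+// in Q16) and numer is multiplied by the same factors, so the result
+// approximates (numer << 16) / denum; e.g. div_q16(3, 4) ~= 0xC000 (0.75).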
+
+/************************************************************************/
+/* Motion Vector arithmetics */
+/************************************************************************/
+
+static point_t point(int x, int y)
+{
+ point_t p;
+ p.u32 = ((unsigned)y << 16) | ((unsigned)x & 0xFFFF); // assumes little-endian
+ return p;
+}
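+// Example: point(-1, 2).u32 == 0x0002FFFF -- y in the high halfword, x as a
+// 16-bit two's complement value in the low halfword.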
+
+static int mv_is_zero(point_t p)
+{
+ return !p.u32;
+}
+
+static int mv_equal(point_t p0, point_t p1)
+{
+ return (p0.u32 == p1.u32);
+}
+
+/**
+* check that difference between given MV's components is greater than 3
+*/
+static int mv_differs3(point_t p0, point_t p1)
+{
+ return ABS(p0.s.x - p1.s.x) > 3 || ABS(p0.s.y - p1.s.y) > 3;
+}
+
+static point_t mv_add(point_t a, point_t b)
+{
+#if defined(__arm__)
+ a.u32 = __sadd16(a.u32, b.u32);
+#elif H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1)
+ a.u32 = _mm_cvtsi128_si32(_mm_add_epi16(_mm_cvtsi32_si128(a.u32), _mm_cvtsi32_si128(b.u32)));
+#else
+ a.s.x += b.s.x;
+ a.s.y += b.s.y;
+#endif
+ return a;
+}
+
+static point_t mv_sub(point_t a, point_t b)
+{
+#if defined(__arm__)
+ a.u32 = __ssub16(a.u32, b.u32);
+#elif H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1)
+ a.u32 = _mm_cvtsi128_si32(_mm_sub_epi16(_mm_cvtsi32_si128(a.u32), _mm_cvtsi32_si128(b.u32)));
+#else
+ a.s.x -= b.s.x;
+ a.s.y -= b.s.y;
+#endif
+ return a;
+}
+
+static void mv_clip(point_t *h264e_restrict p, const rectangle_t *range)
+{
+ p->s.x = MAX(p->s.x, range->tl.s.x);
+ p->s.x = MIN(p->s.x, range->br.s.x);
+ p->s.y = MAX(p->s.y, range->tl.s.y);
+ p->s.y = MIN(p->s.y, range->br.s.y);
+}
+
+static int mv_in_rect(point_t p, const rectangle_t *r)
+{
+ return (p.s.y >= r->tl.s.y && p.s.y <= r->br.s.y && p.s.x >= r->tl.s.x && p.s.x <= r->br.s.x);
+}
+
+static point_t mv_round_qpel(point_t p)
+{
+ return point((p.s.x + 1) & ~3, (p.s.y + 1) & ~3);
+}
+
+/************************************************************************/
+/* Misc macroblock helper functions */
+/************************************************************************/
+/**
+* @return current macroblock input luma pixels
+*/
+static pix_t *mb_input_luma(h264e_enc_t *enc)
+{
+ return enc->inp.yuv[0] + (enc->mb.x + enc->mb.y*enc->inp.stride[0])*16;
+}
+
+/**
+* @return current macroblock input chroma pixels
+*/
+static pix_t *mb_input_chroma(h264e_enc_t *enc, int uv)
+{
+ return enc->inp.yuv[uv] + (enc->mb.x + enc->mb.y*enc->inp.stride[uv])*8;
+}
+
+/**
+* @return absolute MV for current macroblock for given MV
+*/
+static point_t mb_abs_mv(h264e_enc_t *enc, point_t mv)
+{
+ return mv_add(mv, point(enc->mb.x*64, enc->mb.y*64));
+}
+
+/************************************************************************/
+/* Pixel copy functions */
+/************************************************************************/
+/**
+* Copy incomplete (cropped) macroblock pixels with borders extension
+*/
+static void pix_copy_cropped_mb(pix_t *d, int d_stride, const pix_t *s, int s_stride, int w, int h)
+{
+ int nbottom = d_stride - h; // assume dst is a square d_stride x d_stride block
+ s_stride -= w;
+ do
+ {
+ int cloop = w;
+ pix_t last;
+ do
+ {
+ last = *s++;
+ *d++ = last;
+ } while (--cloop);
+ cloop = d_stride - w;
+ if (cloop) do
+ {
+ *d++ = last; // extend row
+ } while (--cloop);
+ s += s_stride;
+ } while (--h);
+ s = d - d_stride;
+ if (nbottom) do
+ {
+ memcpy(d, s, d_stride); // extend columns
+ d += d_stride;
+ } while (--nbottom);
+}
+
+/**
+* Copy one image component
+*/
+static void pix_copy_pic(pix_t *dst, int dst_stride, pix_t *src, int src_stride, int w, int h)
+{
+ do
+ {
+ memcpy(dst, src, w);
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+}
+
+/**
+* Copy reconstructed frame to reference buffer, with border extension
+*/
+static void pix_copy_recon_pic_to_ref(h264e_enc_t *enc)
+{
+ int c, h = enc->frame.h, w = enc->frame.w, guard = 16;
+ for (c = 0; c < 3; c++)
+ {
+ if (enc->param.const_input_flag)
+ {
+ SWAP(pix_t*, enc->ref.yuv[c], enc->dec.yuv[c]);
+ } else
+ {
+ pix_copy_pic(enc->ref.yuv[c], w + 2*guard, enc->dec.yuv[c], w, w, h);
+ }
+
+ h264e_copy_borders(enc->ref.yuv[c], w, h, guard);
+ if (!c) guard >>= 1, w >>= 1, h >>= 1;
+ }
+}
+
+/************************************************************************/
+/* Median MV predictor */
+/************************************************************************/
+
+/**
+* @return neighbors availability flags for current macroblock
+*/
+static int mb_avail_flag(const h264e_enc_t *enc)
+{
+ int nmb = enc->mb.num;
+ int flag = nmb >= enc->slice.start_mb_num + enc->frame.nmbx;
+ if (nmb >= enc->slice.start_mb_num + enc->frame.nmbx - 1 && enc->mb.x != enc->frame.nmbx-1)
+ {
+ flag += AVAIL_TR;
+ }
+ if (nmb != enc->slice.start_mb_num && enc->mb.x)
+ {
+ flag += AVAIL_L;
+ }
+ if (nmb > enc->slice.start_mb_num + enc->frame.nmbx && enc->mb.x)
+ {
+ flag += AVAIL_TL;
+ }
+ return flag;
+}
+
+/**
+* @return median of 3 given integers
+*/
+#if !(H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1))
+static int me_median_of_3(int a, int b, int c)
+{
+ return MAX(MIN(MAX(a, b), c), MIN(a, b));
+}
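+// Example: me_median_of_3(3, 1, 2) = MAX(MIN(MAX(3,1), 2), MIN(3,1))
+//                                  = MAX(MIN(3,2), 1) = MAX(2, 1) = 2.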
+#endif
+
+/**
+* @return median of 3 given motion vectors
+*/
+static point_t point_median_of_3(point_t a, point_t b, point_t c)
+{
+#if H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1)
+ __m128i a2 = _mm_cvtsi32_si128(a.u32);
+ __m128i b2 = _mm_cvtsi32_si128(b.u32);
+ point_t med;
+ med.u32 = _mm_cvtsi128_si32(_mm_max_epi16(_mm_min_epi16(_mm_max_epi16(a2, b2), _mm_cvtsi32_si128(c.u32)), _mm_min_epi16(a2, b2)));
+ return med;
+#else
+ return point(me_median_of_3(a.s.x, b.s.x, c.s.x),
+ me_median_of_3(a.s.y, b.s.y, c.s.y));
+#endif
+}
+
+/**
+* Save state of the MV predictor
+*/
+static void me_mv_medianpredictor_save_ctx(h264e_enc_t *enc, point_t *ctx)
+{
+ int i;
+ point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+ for (i = 0; i < 4; i++)
+ {
+ *ctx++ = enc->mv_pred[i];
+ *ctx++ = enc->mv_pred[4 + i];
+ *ctx++ = mvtop[i];
+ }
+}
+
+/**
+* Restore state of the MV predictor
+*/
+static void me_mv_medianpredictor_restore_ctx(h264e_enc_t *enc, const point_t *ctx)
+{
+ int i;
+ point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+ for (i = 0; i < 4; i++)
+ {
+ enc->mv_pred[i] = *ctx++;
+ enc->mv_pred[4 + i] = *ctx++;
+ mvtop[i] = *ctx++;
+ }
+}
+
+/**
+* Put motion vector to the deblock filter matrix.
+* x,y,w,h refer to 4x4 blocks within the 16x16 macroblock; x and y must be in [0,3], w and h at most 4
+*/
+static void me_mv_dfmatrix_put(point_t *dfmv, int x, int y, int w, int h, point_t mv)
+{
+ int i;
+ assert(y < 4 && x < 4);
+
+ dfmv += y*5 + x + 5; // 5x5 matrix without left-top cell
+ do
+ {
+ for (i = 0; i < w; i++)
+ {
+ dfmv[i] = mv;
+ }
+ dfmv += 5;
+ } while (--h);
+}
+
+/**
+* Use given motion vector for prediction
+*/
+static void me_mv_medianpredictor_put(h264e_enc_t *enc, int x, int y, int w, int h, point_t mv)
+{
+ int i;
+ point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+ assert(y < 4 && x < 4);
+
+ enc->mv_pred[4 + y] = mvtop[x + w-1]; // top-left corner = top-right corner
+ for (i = 1; i < h; i++)
+ {
+ enc->mv_pred[4 + y + i] = mv; // top-left corner(s) for next row(s) = this
+ }
+ for (i = 0; i < h; i++)
+ {
+ enc->mv_pred[y + i] = mv; // left = this
+ }
+ for (i = 0; i < w; i++)
+ {
+ mvtop[x + i] = mv; // top = this
+ }
+}
+
+/**
+* Motion vector median predictor for non-skip macroblock, as defined in the standard
+*/
+static point_t me_mv_medianpredictor_get(const h264e_enc_t *enc, point_t xy, point_t wh)
+{
+ int x = xy.s.x >> 2;
+ int y = xy.s.y >> 2;
+ int w = wh.s.x >> 2;
+ int h = wh.s.y >> 2;
+ int mvPredType = MVPRED_MEDIAN;
+ point_t a, b, c, d, ret = point(0, 0);
+ point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+ int flag = enc->mb.avail;
+
+ assert(y < 4);
+ assert(x < 4);
+ assert(w <= 4);
+ assert(h <= 4);
+
+ a = enc->mv_pred[y];
+ b = mvtop[x];
+ c = mvtop[x + w];
+ d = enc->mv_pred[4 + y];
+
+ if (!x)
+ {
+ if (!(flag & AVAIL_L))
+ {
+ a.u32 = MV_NA;
+ }
+ if (!(flag & AVAIL_TL))
+ {
+ d.u32 = MV_NA;
+ }
+ }
+ if (!y)
+ {
+ if (!(flag & AVAIL_T))
+ {
+ b.u32 = MV_NA;
+ if (x + w < 4)
+ {
+ c.u32 = MV_NA;
+ }
+ if (x > 0)
+ {
+ d.u32 = MV_NA;
+ }
+ }
+ if (!(flag & AVAIL_TL) && !x)
+ {
+ d.u32 = MV_NA;
+ }
+ if (!(flag & AVAIL_TR) && x + w == 4)
+ {
+ c.u32 = MV_NA;
+ }
+ }
+
+ if (x + w == 4 && (!(flag & AVAIL_TR) || y))
+ {
+ c = d;
+ }
+
+ if (AVAIL(a) && !AVAIL(b) && !AVAIL(c))
+ {
+ mvPredType = MVPRED_L;
+ } else if (!AVAIL(a) && AVAIL(b) && !AVAIL(c))
+ {
+ mvPredType = MVPRED_U;
+ } else if (!AVAIL(a) && !AVAIL(b) && AVAIL(c))
+ {
+ mvPredType = MVPRED_UR;
+ }
+
+ // Directional predictions
+ if (w == 2 && h == 4)
+ {
+ if (x == 0)
+ {
+ if (AVAIL(a))
+ {
+ mvPredType = MVPRED_L;
+ }
+ } else
+ {
+ if (AVAIL(c))
+ {
+ mvPredType = MVPRED_UR;
+ }
+ }
+ } else if (w == 4 && h == 2)
+ {
+ if (y == 0)
+ {
+ if (AVAIL(b))
+ {
+ mvPredType = MVPRED_U;
+ }
+ } else
+ {
+ if (AVAIL(a))
+ {
+ mvPredType = MVPRED_L;
+ }
+ }
+ }
+
+ switch(mvPredType)
+ {
+ default:
+ case MVPRED_MEDIAN:
+ if (!(AVAIL(b) || AVAIL(c)))
+ {
+ if (AVAIL(a))
+ {
+ ret = a;
+ }
+ } else
+ {
+ if (!AVAIL(a))
+ {
+ a = ret;
+ }
+ if (!AVAIL(b))
+ {
+ b = ret;
+ }
+ if (!AVAIL(c))
+ {
+ c = ret;
+ }
+ ret = point_median_of_3(a, b, c);
+ }
+ break;
+ case MVPRED_L:
+ if (AVAIL(a))
+ {
+ ret = a;
+ }
+ break;
+ case MVPRED_U:
+ if (AVAIL(b))
+ {
+ ret = b;
+ }
+ break;
+ case MVPRED_UR:
+ if (AVAIL(c))
+ {
+ ret = c;
+ }
+ break;
+ }
+ return ret;
+}
+
+/**
+* Motion vector median predictor for skip macroblock
+*/
+static point_t me_mv_medianpredictor_get_skip(h264e_enc_t *enc)
+{
+ point_t pred_16x16 = me_mv_medianpredictor_get(enc, point(0, 0), point(16, 16));
+ enc->mb.mv_skip_pred = point(0, 0);
+ if (!(~enc->mb.avail & (AVAIL_L | AVAIL_T)))
+ {
+ point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+ if (!mv_is_zero(enc->mv_pred[0]) && !mv_is_zero(mvtop[0]))
+ {
+ enc->mb.mv_skip_pred = pred_16x16;
+ }
+ }
+ return pred_16x16;
+}
+
+/**
+* Get starting points candidates for MV search
+*/
+static int me_mv_medianpredictor_get_cand(const h264e_enc_t *enc, point_t *mv)
+{
+ point_t *mv0 = mv;
+ point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+ int flag = enc->mb.avail;
+ *mv++ = point(0, 0);
+ if ((flag & AVAIL_L) && AVAIL(enc->mv_pred[0]))
+ {
+ *mv++ = enc->mv_pred[0];
+ }
+ if ((flag & AVAIL_T) && AVAIL(mvtop[0]))
+ {
+ *mv++ = mvtop[0];
+ }
+ if ((flag & AVAIL_TR) && AVAIL(mvtop[4]))
+ {
+ *mv++ = mvtop[4];
+ }
+ return (int)(mv - mv0);
+}
+
+
+/************************************************************************/
+/* NAL encoding */
+/************************************************************************/
+
+/**
+* Count # of escapes, i.e. binary strings 0000 0000 0000 0000 0000 00xx
+* P(escape) = 2^-22
+* E(run_between_escapes) = 2^21 ~= 2 MB
+*/
+static int nal_count_esc(const uint8_t *s, int n)
+{
+ int i, cnt_esc = 0, cntz = 0;
+ for (i = 0; i < n; i++)
+ {
+ uint8_t byte = *s++;
+ if (cntz == 2 && byte <= 3)
+ {
+ cnt_esc++;
+ cntz = 0;
+ }
+
+ if (byte)
+ {
+ cntz = 0;
+ } else
+ {
+ cntz++;
+ }
+ }
+ return cnt_esc;
+}
+
+/**
+* Put NAL escape codes to the output bitstream
+*/
+static int nal_put_esc(uint8_t *d, const uint8_t *s, int n)
+{
+ int i, j = 0, cntz = 0;
+ for (i = 0; i < n; i++)
+ {
+ uint8_t byte = *s++;
+ if (cntz == 2 && byte <= 3)
+ {
+ d[j++] = 3;
+ cntz = 0;
+ }
+
+ if (byte)
+ {
+ cntz = 0;
+ } else
+ {
+ cntz++;
+ }
+ d[j++] = byte;
+ }
+ assert(d + j <= s);
+ return j;
+}
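+// Emulation prevention example: the payload bytes 00 00 01 00 00 00 are
+// written out as 00 00 03 01 00 00 03 00 -- an escape byte 03 is inserted
+// after every 00 00 pair that would otherwise be followed by a byte <= 3.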
+
+/**
+* Init NAL encoding
+*/
+static void nal_start(h264e_enc_t *enc, int nal_hdr)
+{
+ uint8_t *d = enc->out + enc->out_pos;
+ d[0] = d[1] = d[2] = 0; d[3] = 1; // start code
+ enc->out_pos += STARTCODE_4BYTES;
+ d += STARTCODE_4BYTES + (-(int)enc->out_pos & 3); // 4-bytes align for bitbuffer
+ assert(IS_ALIGNED(d, 4));
+ h264e_bs_init_bits(enc->bs, d);
+ U(8, nal_hdr);
+}
+
+/**
+* Finalize NAL encoding
+*/
+static void nal_end(h264e_enc_t *enc)
+{
+ int cnt_esc, bs_bytes;
+ uint8_t *nal = enc->out + enc->out_pos;
+
+ U1(1); // stop bit
+ bs_bytes = h264e_bs_byte_align(enc->bs) >> 3;
+ h264e_bs_flush(enc->bs);
+
+ // count # of escape bytes to insert
+ cnt_esc = nal_count_esc((unsigned char*)enc->bs->origin, bs_bytes);
+
+ if ((uint8_t *)enc->bs->origin != nal + cnt_esc)
+ {
+ // make free space for escapes and remove align bytes
+ memmove(nal + cnt_esc, enc->bs->origin, bs_bytes);
+ }
+ if (cnt_esc)
+ {
+ // insert escape bytes
+ bs_bytes = nal_put_esc(nal, nal + cnt_esc, bs_bytes);
+ }
+ if (enc->run_param.nalu_callback)
+ {
+ // Call application-supplied callback
+ enc->run_param.nalu_callback(nal, bs_bytes, enc->run_param.nalu_callback_token);
+ }
+ enc->out_pos += bs_bytes;
+}
+
+
+/************************************************************************/
+/* Top-level syntax elements (SPS,PPS,Slice) */
+/************************************************************************/
+
+/**
+* Encode Sequence Parameter Set (SPS)
+* ref: [1] 7.3.2.1.1
+*/
+
+// temporary globals
+#define dependency_id 1
+#define quality_id 0
+#define default_base_mode_flag 0
+#define log2_max_frame_num_minus4 1
+
+static void encode_sps(h264e_enc_t *enc, int profile_idc)
+{
+ struct limit_t
+ {
+ uint8_t level;
+ uint8_t constrains;
+ uint16_t max_fs;
+ uint16_t max_vbvdiv5;
+ uint32_t max_dpb;
+ };
+ static const struct limit_t limit [] = {
+ {10, 0xE0, 99, 175/5, 396},
+ {10, 0xF0, 99, 350/5, 396},
+ {11, 0xE0, 396, 500/5, 900},
+ {12, 0xE0, 396, 1000/5, 2376},
+ {13, 0xE0, 396, 2000/5, 2376},
+ {20, 0xE0, 396, 2000/5, 2376},
+ {21, 0xE0, 792, 4000/5, 4752},
+ {22, 0xE0, 1620, 4000/5, 8100},
+ {30, 0xE0, 1620, 10000/5, 8100},
+ {31, 0xE0, 3600, 14000/5, 18000},
+ {32, 0xE0, 5120, 20000/5, 20480},
+ {40, 0xE0, 8192, 25000/5, 32768},
+ {41, 0xE0, 8192, 62500/5, 32768},
+ {42, 0xE0, 8704, 62500/5, 34816},
+ {50, 0xE0, 22080, 135000/5, 110400},
+ {51, 0xE0, 36864, 240000/5, 184320}
+ };
+ const struct limit_t *plim = limit;
+
+ while (plim->level < 51 && (enc->frame.nmb > plim->max_fs ||
+ enc->param.vbv_size_bytes > plim->max_vbvdiv5*(5*1000/8) ||
+ (unsigned)(enc->frame.nmb*(enc->param.max_long_term_reference_frames + 1)) > plim->max_dpb))
+ {
+ plim++;
+ }
+
+ nal_start(enc, 0x67 | (profile_idc == SCALABLE_BASELINE)*8);
+ U(8, profile_idc); // profile, 66 = baseline
+ U(8, plim->constrains & ((profile_idc!= SCALABLE_BASELINE)*4)); // no constraints
+ U(8, plim->level);
+ //U(5, 0x1B); // sps_id|log2_max_frame_num_minus4|pic_order_cnt_type
+ //UE(0); // sps_id 1
+ UE(enc->param.sps_id);
+
+#if H264E_SVC_API
+ if(profile_idc== SCALABLE_BASELINE)
+ {
+ UE(1); //chroma_format_idc
+ UE(0); //bit_depth_luma_minus8
+ UE(0); //bit_depth_chroma_minus8)
+ U1(0); //qpprime_y_zero_transform_bypass_flag
+ U1(0); //seq_scaling_matrix_present_flag
+ }
+#endif
+ UE(log2_max_frame_num_minus4); // log2_max_frame_num_minus4 1 UE(0); // log2_max_frame_num_minus4 1
+ UE(2); // pic_order_cnt_type 011
+ UE(1 + enc->param.max_long_term_reference_frames); // num_ref_frames
+ U1(0); // gaps_in_frame_num_value_allowed_flag);
+ UE(((enc->param.width + 15) >> 4) - 1); // pic_width_in_mbs_minus1
+ UE(((enc->param.height + 15) >> 4) - 1); // pic_height_in_map_units_minus1
+ U(3, 6 + enc->frame.cropping_flag); // frame_mbs_only_flag|direct_8x8_inference_flag|frame_cropping_flag
+// U1(1); // frame_mbs_only_flag
+// U1(1); // direct_8x8_inference_flag
+// U1(frame_cropping_flag); // frame_cropping_flag
+ if (enc->frame.cropping_flag)
+ {
+ UE(0); // frame_crop_left_offset
+ UE((enc->frame.w - enc->param.width) >> 1); // frame_crop_right_offset
+ UE(0); // frame_crop_top_offset
+ UE((enc->frame.h - enc->param.height) >> 1); // frame_crop_bottom_offset
+ }
+ U1(0); // vui_parameters_present_flag
+
+#if H264E_SVC_API
+ if(profile_idc == SCALABLE_BASELINE)
+ {
+ U1(1); //(inter_layer_deblocking_filter_control_present_flag); //inter_layer_deblocking_filter_control_present_flag
+ U(2,0); //extended_spatial_scalability
+ U1(0); //chroma_phase_x_plus1_flag
+ U(2,0); //chroma_phase_y_plus1
+
+ /* if( sps->sps_ext.extended_spatial_scalability == 1 )
+ {
+ //if( ChromaArrayType > 0 )
+ {
+ put_bits( s, 1,0);
+ put_bits( s, 2,0); ///
+ }
+ put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_left_offset );
+ put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_top_offset );
+ put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_right_offset );
+ put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_bottom_offset );
+ }*/
+ U1(0); //seq_tcoeff_level_prediction_flag
+ U1(1); //slice_header_restriction_flag
+ U1(0); //svc_vui_parameters_present_flag
+ U1(0); //additional_extension2_flag
+ }
+#endif
+ nal_end(enc);
+}
+
+/**
+* Encode Picture Parameter Set (PPS)
+* ref: [1] 7.3.2.2
+*/
+static void encode_pps(h264e_enc_t *enc, int pps_id)
+{
+ nal_start(enc, 0x68);
+ // U(10, 0x338); // constant shortcut:
+ UE(enc->param.sps_id*4 + pps_id); // pic_parameter_set_id 1
+ UE(enc->param.sps_id); // seq_parameter_set_id 1
+ U1(0); // entropy_coding_mode_flag 0
+ U1(0); // pic_order_present_flag 0
+ UE(0); // num_slice_groups_minus1 1
+ UE(0); // num_ref_idx_l0_active_minus1 1
+ UE(0); // num_ref_idx_l1_active_minus1 1
+ U1(0); // weighted_pred_flag 0
+ U(2,0); // weighted_bipred_idc 00
+ SE(enc->sps.pic_init_qp - 26); // pic_init_qp_minus26
+#if DQP_CHROMA
+ SE(0); // pic_init_qs_minus26 1
+ SE(DQP_CHROMA); // chroma_qp_index_offset 1
+ U1(1); // deblocking_filter_control_present_flag 1
+ U1(0); // constrained_intra_pred_flag 0
+ U1(0); // redundant_pic_cnt_present_flag 0
+#else
+ U(5, 0x1C); // constant shortcut:
+// SE(0); // pic_init_qs_minus26 1
+// SE(0); // chroma_qp_index_offset 1
+// U1(1); // deblocking_filter_control_present_flag 1
+// U1(0); // constrained_intra_pred_flag 0
+// U1(0); // redundant_pic_cnt_present_flag 0
+#endif
+ nal_end(enc);
+}
+
+/**
+* Encode Slice Header
+* ref: [1] 7.3.3
+*/
+static void encode_slice_header(h264e_enc_t *enc, int frame_type, int long_term_idx_use, int long_term_idx_update, int pps_id, int enc_type)
+{
+ // slice reset
+ enc->slice.start_mb_num = enc->mb.num;
+ enc->mb.skip_run = 0;
+ memset(enc->i4x4mode, -1, (enc->frame.nmbx + 1)*4);
+ memset(enc->nnz, NNZ_NA, (enc->frame.nmbx + 1)*8); // DF ignores slice borders, but uses its own nnz's
+
+ if (enc_type == 0)
+ {
+#if H264E_SVC_API
+ if (enc->param.num_layers > 1)
+ {
+ //need prefix nal for compatibility base layer with h264
+ nal_start(enc, 14 | 0x40);
+ //if((nal_unit_type == NAL_UNIT_TYPE_PREFIX_SCALABLE_EXT ) ||nal_unit_type == NAL_UNIT_TYPE_RBSP_SCALABLE_EXT))
+ {
+ //reserved_one_bit = 1 idr_flag priority_id
+ U(8, (1 << 7) | ((frame_type == H264E_FRAME_TYPE_KEY) << 6) | 0);
+ U1(1); //no_inter_layer_pred_flag
+ U(3, 0); //dependency_id
+ U(4, quality_id); //quality_id
+ //reserved_three_2bits = 3!
+ U(3, 0); //temporal_id
+ U1(1); //use_ref_base_pic_flag
+ U1(0); //discardable_flag
+ U1(1); //output_flag
+ U(2, 3);
+
+ U1(0); //store_ref_base_pic_flag
+ if (!(frame_type == H264E_FRAME_TYPE_KEY))
+ {
+ U1(0); //adaptive_ref_base_pic_marking_mode_flag u(1)
+ }
+
+ U1(0); //prefix_nal_unit_additional_extension_flag 2 u(1)
+
+ //put_bits_rbsp_trailing( s );
+ }
+ nal_end(enc);
+ }
+#endif //#if H264E_SVC_API
+ nal_start(enc, (frame_type == H264E_FRAME_TYPE_KEY ? 5 : 1) | (long_term_idx_update >= 0 ? 0x60 : 0));
+ }
+#if H264E_SVC_API
+ else
+ {
+ nal_start(enc, (20 | (long_term_idx_update >= 0 ? 0x60 : 0))); //RBSP_SCALABLE_EXT = 20
+ //nal_unit_type 20 or 14
+ {
+ //reserved_one_bit = 1 idr_flag priority_id
+ U(8, (1 << 7) | ((frame_type == H264E_FRAME_TYPE_KEY) << 6) | 0);
+ U1(!enc->param.inter_layer_pred_flag); //no_inter_layer_pred_flag
+ U(3, dependency_id); //dependency_id
+ U(4, quality_id); //quality_id
+ //reserved_three_2bits = 3!!!
+ U(3, 0); //temporal_id
+ U1(0); //use_ref_base_pic_flag
+ U1(1); //discardable_flag
+ U1(1); //output_flag
+ U(2, 3);
+ }
+ }
+#endif
+
+ UE(enc->slice.start_mb_num); // first_mb_in_slice
+ UE(enc->slice.type); // slice_type
+ //U(1+4, 16 + (enc->frame.num&15)); // pic_parameter_set_id | frame_num
+ UE(pps_id); // pic_parameter_set_id
+ U(4 + log2_max_frame_num_minus4, enc->frame.num & ((1 << (log2_max_frame_num_minus4 + 4)) - 1)); // frame_num U(4, enc->frame.num&15); // frame_num
+ if (frame_type == H264E_FRAME_TYPE_KEY)
+ {
+ UE(enc->next_idr_pic_id); // idr_pic_id
+ }
+ //!!! if !quality_id && enc->slice.type == SLICE_TYPE_P put_bit(s, 0); // num_ref_idx_active_override_flag = 0
+ if(!quality_id)
+ {
+ if (((enc_type != 0)) && enc->slice.type == SLICE_TYPE_P)
+ {
+ //U1(0);
+ }
+ if (enc->slice.type == SLICE_TYPE_P)// if( slice_type == P | | slice_type == SP | | slice_type = = B )
+ {
+ int ref_pic_list_modification_flag_l0 = long_term_idx_use > 0;
+ //U1(0); // num_ref_idx_active_override_flag
+ // ref_pic_list_modification()
+ U(2, ref_pic_list_modification_flag_l0); // num_ref_idx_active_override_flag | ref_pic_list_modification_flag_l0
+ if (ref_pic_list_modification_flag_l0)
+ {
+ // Table 7-7
+ UE(2); // long_term_pic_num is present and specifies the long-term picture number for a reference picture
+ UE(long_term_idx_use - 1); // long_term_pic_num
+ UE(3); // End loop
+ }
+ }
+
+ if (long_term_idx_update >= 0)
+ {
+ //dec_ref_pic_marking( )
+ if (frame_type == H264E_FRAME_TYPE_KEY)
+ {
+ //U1(0); // no_output_of_prior_pics_flag
+ //U1(enc->param.enable_golden_frames_flag); // long_term_reference_flag
+ U(2, enc->param.max_long_term_reference_frames > 0); // no_output_of_prior_pics_flag | long_term_reference_flag
+ } else
+ {
+ int adaptive_ref_pic_marking_mode_flag = long_term_idx_update > 0;//(frame_type == H264E_FRAME_TYPE_GOLDEN);
+ U1(adaptive_ref_pic_marking_mode_flag);
+ if (adaptive_ref_pic_marking_mode_flag)
+ {
+ // Table 7-9
+ if (enc->short_term_used)
+ {
+ UE(1); // unmark short
+ UE(0); // unmark short
+ }
+ if (enc->lt_used[long_term_idx_update - 1])
+ {
+ UE(2); // Mark a long-term reference picture as "unused for reference"
+ UE(long_term_idx_update - 1); // index
+ } else
+ {
+ UE(4); // Specify the maximum long-term frame index
+ UE(enc->param.max_long_term_reference_frames); // [0,max-1]+1
+ }
+ UE(6); // Mark the current picture as "used for long-term reference"
+ UE(long_term_idx_update - 1); // index
+ UE(0); // End loop
+ }
+ }
+ }
+ }
+ SE(enc->rc.prev_qp - enc->sps.pic_init_qp); // slice_qp_delta
+#if H264E_MAX_THREADS
+ if (enc->param.max_threads > 1)
+ {
+ UE(enc->speed.disable_deblock ? 1 : 2);
+ } else
+#endif
+ {
+ UE(enc->speed.disable_deblock); // disable deblock
+ }
+
+ if (enc->speed.disable_deblock != 1)
+ {
+#if ALPHA_OFS || BETA_OFS
+ SE(ALPHA_OFS/2); // slice_alpha_c0_offset_div2
+ SE(BETA_OFS/2); // slice_beta_offset_div2
+#else
+ U(2, 3);
+#endif
+ }
+
+#if H264E_SVC_API
+ if (enc_type != 0)
+ {
+ enc->adaptive_base_mode_flag = enc->param.inter_layer_pred_flag;
+ if (enc->param.inter_layer_pred_flag && !quality_id)
+ {
+ UE(16*(dependency_id - 1));
+ //if(1)//(inter_layer_deblocking_filter_control_present_flag)
+ {
+ UE(0);//disable_inter_layer_deblocking_filter_idc
+ UE(0);
+ UE(0);
+ }
+ /*if( sh->disable_inter_layer_deblocking_filter_idc != 1 )
+ {
+ put_bits_se(s, sh->slice_alpha_c0_offset_div2);
+ put_bits_se(s, sh->slice_beta_offset_div2);
+ }*/
+ U1(0); // constrained_intra_resampling_flag 2 u(1)
+ }
+ if (enc->param.inter_layer_pred_flag)
+ {
+ U1(0); //slice_skip_flag u(1)
+ {
+ U1(enc->adaptive_base_mode_flag); // 2 u(1)
+ if (!enc->adaptive_base_mode_flag)
+ U1(default_base_mode_flag); // 2 u(1)
+ if (!default_base_mode_flag)
+ {
+ U1(0); //adaptive_motion_prediction_flag) // 2 u(1)
+ U1(0); //sh->default_motion_prediction_flag// 2 u(1)
+ }
+ U1(0); //adaptive_residual_prediction_flag // 2 u(1)
+ U1(0); //default_residual_prediction_flag // 2 u(1)
+ }
+ }
+ }
+#endif // #if H264E_SVC_API
+}
+
+/**
+* Macroblock transform, quantization and bitstream encoding
+*/
+static void mb_write(h264e_enc_t *enc, int enc_type, int base_mode)
+{
+ int i, uv, mb_type, cbpc, cbpl, cbp;
+ scratch_t *qv = enc->scratch;
+ //int base_mode = enc_type > 0 ? 1 : 0;
+ int mb_type_svc = base_mode ? -2 : enc->mb.type;
+ int intra16x16_flag = mb_type_svc >= 6;// && !base_mode;
+ uint8_t nz[9];
+ uint8_t *nnz_top = enc->nnz + 8 + enc->mb.x*8;
+ uint8_t *nnz_left = enc->nnz;
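+ // enc->nnz layout: first 8 entries hold the left-neighbor NNZ values,
+ // then 8 entries per MB column hold the NNZ of the row above (4 luma + 2+2 chroma)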
+
+ if (enc->mb.type != 5)
+ {
+ enc->i4x4mode[0] = enc->i4x4mode[enc->mb.x + 1] = 0x02020202;
+ }
+
+ enc->df.nzflag = ((enc->df.nzflag >> 4) & 0x84210) | enc->df.df_nzflag[enc->mb.x];
+ for (i = 0; i < 4; i++)
+ {
+ nz[5 + i] = nnz_top[i];
+ nnz_top[i] = 0;
+ nz[3 - i] = nnz_left[i];
+ nnz_left[i] = 0;
+ }
+
+l_skip:
+ if (enc->mb.type == -1)
+ {
+ // encode skip macroblock
+ assert(enc->slice.type != SLICE_TYPE_I);
+
+ // Increment run count
+ enc->mb.skip_run++;
+
+ // Update predictors
+ *(uint32_t*)(nnz_top + 4) = *(uint32_t*)(nnz_left + 4) = 0; // set chroma NNZ to 0
+ me_mv_medianpredictor_put(enc, 0, 0, 4, 4, enc->mb.mv[0]);
+ me_mv_dfmatrix_put(enc->df.df_mv, 0, 0, 4, 4, enc->mb.mv[0]);
+
+ // Update reference with reconstructed pixels
+ h264e_copy_16x16(enc->dec.yuv[0], enc->dec.stride[0], enc->pbest, 16);
+ h264e_copy_8x8(enc->dec.yuv[1], enc->dec.stride[1], enc->ptest);
+ h264e_copy_8x8(enc->dec.yuv[2], enc->dec.stride[2], enc->ptest + 8);
+ } else
+ {
+ if (enc->mb.type != 5)
+ {
+ unsigned nz_mask;
+ nz_mask = h264e_transform_sub_quant_dequant(qv->mb_pix_inp, enc->pbest, 16, intra16x16_flag ? QDQ_MODE_INTRA_16 : QDQ_MODE_INTER, qv->qy, enc->rc.qdat[0]);
+ enc->scratch->nz_mask = (uint16_t)nz_mask;
+ if (intra16x16_flag)
+ {
+ h264e_quant_luma_dc(qv->qy, qv->quant_dc, enc->rc.qdat[0]);
+ nz_mask = 0xFFFF;
+ }
+ h264e_transform_add(enc->dec.yuv[0], enc->dec.stride[0], enc->pbest, qv->qy, 4, nz_mask << 16);
+ }
+
+ // Coded Block Pattern for luma
+ cbpl = 0;
+ if (enc->scratch->nz_mask & 0xCC00) cbpl |= 1;
+ if (enc->scratch->nz_mask & 0x3300) cbpl |= 2;
+ if (enc->scratch->nz_mask & 0x00CC) cbpl |= 4;
+ if (enc->scratch->nz_mask & 0x0033) cbpl |= 8;
+
+ // Coded Block Pattern for chroma
+ cbpc = 0;
+ for (uv = 1; uv < 3; uv++)
+ {
+ pix_t *pred = enc->ptest + (uv - 1)*8;
+ pix_t *pix_mb_uv = mb_input_chroma(enc, uv);
+ int dc_flag, inp_stride = enc->inp.stride[uv];
+ unsigned nz_mask;
+ quant_t *pquv = (uv == 1) ? qv->qu : qv->qv;
+
+ if (enc->frame.cropping_flag && ((enc->mb.x + 1)*16 > enc->param.width || (enc->mb.y + 1)*16 > enc->param.height))
+ {
+ pix_copy_cropped_mb(enc->scratch->mb_pix_inp, 8, pix_mb_uv, enc->inp.stride[uv],
+ MIN(8, enc->param.width/2 - enc->mb.x*8),
+ MIN(8, enc->param.height/2 - enc->mb.y*8)
+ );
+ pix_mb_uv = enc->scratch->mb_pix_inp;
+ inp_stride = 8;
+ }
+
+ nz_mask = h264e_transform_sub_quant_dequant(pix_mb_uv, pred, inp_stride, QDQ_MODE_CHROMA, pquv, enc->rc.qdat[1]);
+
+ if (nz_mask)
+ {
+ cbpc = 2;
+ }
+
+ cbpc |= dc_flag = h264e_quant_chroma_dc(pquv, uv == 1 ? qv->quant_dc_u : qv->quant_dc_v, enc->rc.qdat[1]);
+
+ if (!(dc_flag | nz_mask))
+ {
+ h264e_copy_8x8(enc->dec.yuv[uv], enc->dec.stride[uv], pred);
+ } else
+ {
+ if (dc_flag)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ if (~nz_mask & (8 >> i))
+ {
+ memset(pquv[i].dq + 1, 0, (16 - 1)*sizeof(int16_t));
+ }
+ }
+ nz_mask = 15;
+ }
+ h264e_transform_add(enc->dec.yuv[uv], enc->dec.stride[uv], pred, pquv, 2, nz_mask << 28);
+ }
+ }
+ cbpc = MIN(cbpc, 2);
+
+ // Rollback to skip
+ if (!(enc->mb.type | cbpl | cbpc) && // Inter prediction, all-zero after quantization
+ mv_equal(enc->mb.mv[0], enc->mb.mv_skip_pred)) // MV equals the MV predictor for skip
+ {
+ enc->mb.type = -1;
+ goto l_skip;
+ }
+
+ mb_type = enc->mb.type;
+ if (mb_type_svc >= 6) // intra 16x16
+ {
+ if (cbpl)
+ {
+ cbpl = 15;
+ }
+ mb_type += enc->mb.i16.pred_mode_luma + cbpc*4 + (cbpl ? 12 : 0);
+ }
+ if (mb_type >= 5 && enc->slice.type == SLICE_TYPE_I) // Intra in I slice
+ {
+ mb_type -= 5;
+ }
+
+ if (enc->slice.type != SLICE_TYPE_I)
+ {
+ UE(enc->mb.skip_run);
+ enc->mb.skip_run = 0;
+ }
+
+ (void)enc_type;
+#if H264E_SVC_API
+ if (enc->adaptive_base_mode_flag && enc_type > 0)
+ U1(base_mode);
+#endif
+
+ if (!base_mode)
+ UE(mb_type);
+
+ if (enc->mb.type == 3) // 8x8
+ {
+ for (i = 0; i < 4; i++)
+ {
+ UE(0);
+ }
+ // 0 = 8x8
+ // 1 = 8x4
+ // 2 = 4x8
+ // 3 = 4x4
+ }
+
+ if (!base_mode)
+ {
+ if (enc->mb.type >= 5) // intra
+ {
+ int pred_mode_chroma;
+ if (enc->mb.type == 5) // intra 4x4
+ {
+ for (i = 0; i < 16; i++)
+ {
+ int m = enc->mb.i4x4_mode[decode_block_scan[i]];
+ int nbits = 4;
+ if (m < 0)
+ {
+ m = nbits = 1;
+ }
+ U(nbits, m);
+ }
+ }
+ pred_mode_chroma = enc->mb.i16.pred_mode_luma;
+ if (!(pred_mode_chroma&1))
+ {
+ pred_mode_chroma ^= 2;
+ }
+ UE(pred_mode_chroma);
+ me_mv_medianpredictor_put(enc, 0, 0, 4, 4, point(MV_NA,0));
+ } else
+ {
+ int part, x = 0, y = 0;
+ int dx = (enc->mb.type & 2) ? 2 : 4;
+ int dy = (enc->mb.type & 1) ? 2 : 4;
+ for (part = 0;;part++)
+ {
+ SE(enc->mb.mvd[part].s.x);
+ SE(enc->mb.mvd[part].s.y);
+ me_mv_medianpredictor_put(enc, x, y, dx, dy, enc->mb.mv[part]);
+ me_mv_dfmatrix_put(enc->df.df_mv, x, y, dx, dy, enc->mb.mv[part]);
+ x = (x + dx) & 3;
+ if (!x)
+ {
+ y = (y + dy) & 3;
+ if (!y)
+ {
+ break;
+ }
+ }
+ }
+ }
+ }
+ cbp = cbpl + (cbpc << 4);
+ /*temp for test up-sample filter*/
+ /*if(base_mode)
+ {
+ cbp = 0;
+ cbpl=0;
+ cbpc = 0;
+ }*/
+ if (mb_type_svc < 6)
+ {
+ // encode cbp 9.1.2 Mapping process for coded block pattern
+ static const uint8_t cbp2code[2][48] = {
+ {3, 29, 30, 17, 31, 18, 37, 8, 32, 38, 19, 9, 20, 10, 11, 2, 16, 33, 34, 21, 35, 22, 39, 4,
+ 36, 40, 23, 5, 24, 6, 7, 1, 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0},
+ {0, 2, 3, 7, 4, 8, 17, 13, 5, 18, 9, 14, 10, 15, 16, 11, 1, 32, 33, 36, 34, 37, 44, 40,
+ 35, 45, 38, 41, 39, 42, 43, 19, 6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12}
+ };
+ UE(cbp2code[mb_type_svc < 5][cbp]);
+ }
+
+ if (cbp || (mb_type_svc >= 6))
+ {
+ SE(enc->rc.qp - enc->rc.prev_qp);
+ enc->rc.prev_qp = enc->rc.qp;
+ }
+
+ // *** Huffman encoding ***
+
+ // 1. Encode Luma DC (intra 16x16 only)
+ if (intra16x16_flag)
+ {
+ h264e_vlc_encode(enc->bs, qv->quant_dc, 16, nz + 4);
+ }
+
+ // 2. Encode luma residual (only if CBP non-zero)
+ if (cbpl)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ int j = decode_block_scan[i];
+ if (cbp & (1 << (i >> 2)))
+ {
+ uint8_t *pnz = nz + 4 + (j & 3) - (j >> 2);
+ h264e_vlc_encode(enc->bs, qv->qy[j].qv, 16 - intra16x16_flag, pnz);
+ if (*pnz)
+ {
+ enc->df.nzflag |= 1 << (5 + (j & 3) + 5*(j >> 2));
+ }
+ } else
+ {
+ nz[4 + (j & 3) - (j >> 2)] = 0;
+ }
+ }
+ for (i = 0; i < 4; i++)
+ {
+ nnz_top[i] = nz[1 + i];
+ nnz_left[i] = nz[7 - i];
+ }
+ }
+
+ // 3. Encode chroma
+ if (cbpc)
+ {
+ uint8_t nzcdc[3];
+ nzcdc[0] = nzcdc[2] = 17; // dummy neighbors, indicating chroma DC
+ // 3.1. Encode chroma DC
+ for (uv = 1; uv < 3; uv++)
+ {
+ h264e_vlc_encode(enc->bs, uv == 1 ? qv->quant_dc_u : qv->quant_dc_v, 4, nzcdc + 1);
+ }
+
+ // 3.2. Encode chroma residual
+ if (cbpc > 1)
+ {
+ for (uv = 1; uv < 3; uv++)
+ {
+ uint8_t nzc[5];
+ int nnz_off = (uv == 1 ? 4 : 6);
+ quant_t *pquv = uv == 1 ? qv->qu : qv->qv;
+ for (i = 0; i < 2; i++)
+ {
+ nzc[3 + i] = nnz_top[nnz_off + i] ;
+ nzc[1 - i] = nnz_left[nnz_off + i];
+ }
+ for (i = 0; i < 4; i++)
+ {
+ int k = 2 + (i & 1) - (i >> 1);
+ h264e_vlc_encode(enc->bs, pquv[i].qv, 15, nzc + k);
+ }
+ for (i = 0; i < 2; i++)
+ {
+ nnz_top[nnz_off + i] = nzc[1 + i];
+ nnz_left[nnz_off + i] = nzc[3 - i];
+ }
+ }
+ }
+ }
+ if (cbpc !=2)
+ {
+ *(uint32_t*)(nnz_top+4) = *(uint32_t*)(nnz_left+4) = 0; // set chroma NNZ to 0
+ }
+ }
+
+ // Save top & left lines
+ for (uv = 0; uv < 3; uv++)
+ {
+ int off = 0, n = uv ? 8 : 16;
+ pix_t *top = enc->top_line + 48 + enc->mb.x*32;
+ pix_t *left = enc->top_line;
+ pix_t *mb = enc->dec.yuv[uv];
+
+ if (uv)
+ {
+ off = 8 + uv*8;
+ }
+ top += off;
+ left += off;
+
+ enc->top_line[32 + uv] = top[n - 1];
+ for (i = 0; i < n; i++)
+ {
+ left[i] = mb[n - 1 + i*enc->dec.stride[uv]];
+ top[i] = mb[(n - 1)*enc->dec.stride[uv] + i];
+ }
+ }
+}
+
+/************************************************************************/
+/* Intra mode encoding */
+/************************************************************************/
+/**
+* Estimate cost of 4x4 intra predictor
+*/
+static void intra_choose_4x4(h264e_enc_t *enc)
+{
+ int i, n, a, nz_mask = 0, avail = mb_avail_flag(enc);
+ scratch_t *qv = enc->scratch;
+ pix_t *mb_dec = enc->dec.yuv[0];
+ pix_t *dec = enc->ptest;
+ int cost = g_lambda_i4_q4[enc->rc.qp];// + MUL_LAMBDA(16, g_lambda_q4[enc->rc.qp]); // 4x4 cost: at least 16 bits + penalty
+
+ uint32_t edge_store[(3 + 16 + 1 + 16 + 4)/4 + 2]; // pad for SSE
+ pix_t *edge = ((pix_t*)edge_store) + 3 + 16 + 1;
+ uint32_t *edge32 = (uint32_t *)edge; // alias
+ const uint32_t *top32 = (const uint32_t*)(enc->top_line + 48 + enc->mb.x*32);
+ pix_t *left = enc->top_line;
+
+ edge[-1] = enc->top_line[32];
+ for (i = 0; i < 16; i++)
+ {
+ edge[-2 - i] = left[i];
+ }
+ for (i = 0; i < 4; i++)
+ {
+ edge32[i] = top32[i];
+ }
+ edge32[4] = top32[8];
+
+ for (n = 0; n < 16; n++)
+ {
+ static const uint8_t block2avail[16] = {
+ 0x07, 0x23, 0x23, 0x2b, 0x9b, 0x77, 0xff, 0x77, 0x9b, 0xff, 0xff, 0x77, 0x9b, 0x77, 0xff, 0x77,
+ };
+ pix_t *block;
+ pix_t *blockin;
+ int sad, mpred, mode;
+ int r = n >> 2;
+ int c = n & 3;
+ int8_t *ctx_l = (int8_t *)enc->i4x4mode + r;
+ int8_t *ctx_t = (int8_t *)enc->i4x4mode + 4 + enc->mb.x*4 + c;
+ edge = ((pix_t*)edge_store) + 3 + 16 + 1 + 4*c - 4*r;
+
+ a = avail;
+ a &= block2avail[n];
+ a |= block2avail[n] >> 4;
+
+ if (!(block2avail[n] & AVAIL_TL)) // TL replace
+ {
+ if ((n <= 3 && (avail & AVAIL_T)) ||
+ (n > 3 && (avail & AVAIL_L)))
+ {
+ a |= AVAIL_TL;
+ }
+ }
+ if (n < 3 && (avail & AVAIL_T))
+ {
+ a |= AVAIL_TR;
+ }
+
+ blockin = enc->scratch->mb_pix_inp + (c + r*16)*4;
+ block = dec + (c + r*16)*4;
+
+ mpred = MIN(*ctx_l, *ctx_t);
+ if (mpred < 0)
+ {
+ mpred = 2;
+ }
+
+ sad = h264e_intra_choose_4x4(blockin, block, a, edge, mpred, MUL_LAMBDA(3, g_lambda_q4[enc->rc.qp]));
+ mode = sad & 15;
+ sad >>= 4;
+
+ *ctx_l = *ctx_t = (int8_t)mode;
+ if (mode == mpred)
+ {
+ mode = -1;
+ } else if (mode > mpred)
+ {
+ mode--;
+ }
+ enc->mb.i4x4_mode[n] = (int8_t)mode;
+
+ nz_mask <<= 1;
+ if (sad > g_skip_thr_i4x4[enc->rc.qp])
+ {
+ // skipping the transform on low SAD gains only about 2% for all-intra coding at QP40;
+ // for other QPs the gain is minimal, so the SAD check brings little benefit
+ nz_mask |= h264e_transform_sub_quant_dequant(blockin, block, 16, QDQ_MODE_INTRA_4, qv->qy + n, enc->rc.qdat[0]);
+
+ if (nz_mask & 1)
+ {
+ h264e_transform_add(block, 16, block, qv->qy + n, 1, ~0);
+ }
+ } else
+ {
+ memset((qv->qy+n), 0, sizeof(qv->qy[0]));
+ }
+
+ cost += sad;
+
+ edge[2] = block[3];
+ edge[1] = block[3 + 16];
+ edge[0] = block[3 + 16*2];
+ *(uint32_t*)&edge[-4] = *(uint32_t*)&block[16*3];
+ }
+ enc->scratch->nz_mask = (uint16_t)nz_mask;
+
+ if (cost < enc->mb.cost)
+ {
+ enc->mb.cost = cost;
+ enc->mb.type = 5; // intra 4x4
+ h264e_copy_16x16(mb_dec, enc->dec.stride[0], dec, 16); // restore reference
+ }
+}
+
+/**
+* Choose 16x16 prediction mode, most suitable for given gradient
+*/
+static int intra_estimate_16x16(pix_t *p, int s, int avail, int qp)
+{
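+ // bitmask of usable I16x16 prediction modes for each neighbor availability pattern:
+ // bit0 = vertical, bit1 = horizontal, bit2 = DC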
+ static const uint8_t mode_i16x16_valid[8] = { 4, 5, 6, 7, 4, 5, 6, 15 };
+ int p00 = p[0];
+ int p01 = p[15];
+ int p10 = p[15*s + 0];
+ int p11 = p[15*s + 15];
+ int v = mode_i16x16_valid[avail & (AVAIL_T + AVAIL_L + AVAIL_TL)];
+ // better than above on low bitrates
+ int dx = ABS(p00 - p01) + ABS(p10 - p11) + ABS((int)p[8*s] - (int)p[8*s + 15]);
+ int dy = ABS(p00 - p10) + ABS(p01 - p11) + ABS((int)p[8] - (int)p[15*s + 8]);
+
+ if ((dx > 30 + 3*dy && dy < (100 + 50 - qp)
+ //|| (/*dx < 50 &&*/ dy <= 12)
+ ) && (v & 1))
+ return 0;
+ else if (dy > 30 + 3*dx && dx < (100 + 50 - qp) && (v & (1 << 1)))
+ return 1;
+ else
+ return 2;
+}
+
+/**
+* Estimate cost of 16x16 intra predictor
+*
+* for foreman@qp10
+*
+* 12928 - [0-3], [0]
+* 12963 - [0-2], [0]
+* 12868 - [0-2], [0-3]
+* 12878 - [0-2], [0-2]
+* 12834 - [0-3], [0-3]
+*sad
+* 13182
+*heuristic
+* 13063
+*
+*/
+static void intra_choose_16x16(h264e_enc_t *enc, pix_t *left, pix_t *top, int avail)
+{
+ int sad, sad4[4];
+ // heuristic mode decision
+ enc->mb.i16.pred_mode_luma = intra_estimate_16x16(enc->scratch->mb_pix_inp, 16, avail, enc->rc.qp);
+
+ // run chosen predictor
+ h264e_intra_predict_16x16(enc->ptest, left, top, enc->mb.i16.pred_mode_luma);
+
+ // coding cost
+ sad = h264e_sad_mb_unlaign_8x8(enc->scratch->mb_pix_inp, 16, enc->ptest, sad4) // SAD
+ + MUL_LAMBDA(bitsize_ue(enc->mb.i16.pred_mode_luma + 1), g_lambda_q4[enc->rc.qp]) // side-info penalty
+ + g_lambda_i16_q4[enc->rc.qp]; // block kind penalty
+
+ if (sad < enc->mb.cost)
+ {
+ enc->mb.cost = sad;
+ enc->mb.type = 6;
+ SWAP(pix_t*, enc->pbest, enc->ptest);
+ }
+}
+
+/************************************************************************/
+/* Inter mode encoding */
+/************************************************************************/
+
+/**
+* Sub-pel luma interpolation
+*/
+static void interpolate_luma(const pix_t *ref, int stride, point_t mv, point_t wh, pix_t *dst)
+{
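+ // mv is in quarter-pel units: the integer part offsets the reference pointer,
+ // the low 2 bits of each component select the sub-pel interpolation filter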
+ ref += (mv.s.y >> 2) * stride + (mv.s.x >> 2);
+ mv.u32 &= 0x000030003;
+ h264e_qpel_interpolate_luma(ref, stride, dst, wh, mv);
+}
+
+/**
+* Sub-pel chroma interpolation
+*/
+static void interpolate_chroma(h264e_enc_t *enc, point_t mv)
+{
+ int i;
+ for (i = 1; i < 3; i++)
+ {
+ point_t wh;
+ int part = 0, x = 0, y = 0;
+ wh.s.x = (enc->mb.type & 2) ? 4 : 8;
+ wh.s.y = (enc->mb.type & 1) ? 4 : 8;
+ if (enc->mb.type == -1) // skip
+ {
+ wh.s.x = wh.s.y = 8;
+ }
+
+ for (;;part++)
+ {
+ pix_t *ref;
+ mv = mb_abs_mv(enc, enc->mb.mv[part]);
+ ref = enc->ref.yuv[i] + ((mv.s.y >> 3) + y)*enc->ref.stride[i] + (mv.s.x >> 3) + x;
+ mv.u32 &= 0x00070007;
+ h264e_qpel_interpolate_chroma(ref, enc->ref.stride[i], enc->ptest + (i - 1)*8 + 16*y + x, wh, mv);
+ x = (x + wh.s.x) & 7;
+ if (!x)
+ {
+ y = (y + wh.s.y) & 7;
+ if (!y)
+ {
+ break;
+ }
+ }
+ }
+ }
+}
+
+/**
+* RD cost of given MV
+*/
+static int me_mv_cost(point_t mv, point_t mv_pred, int qp)
+{
+ int nb = bits_se(mv.s.x - mv_pred.s.x) + bits_se(mv.s.y - mv_pred.s.y);
+ return MUL_LAMBDA(nb, g_lambda_mv_q4[qp]);
+}
+
+/**
+* RD cost of given MV candidate (TODO)
+*/
+#define me_mv_cand_cost me_mv_cost
+//static int me_mv_cand_cost(point_t mv, point_t mv_pred, int qp)
+//{
+// int nb = bits_se(mv.s.x - mv_pred.s.x) + bits_se(mv.s.y - mv_pred.s.y);
+// return MUL_LAMBDA(nb, g_lambda_mv_q4[qp]);
+//}
+
+
+/**
+* Modified full-pel motion search with small diamond algorithm
+* note: diamond implemented with small modifications, trading speed for precision
+*/
+static int me_search_diamond(h264e_enc_t *enc, const pix_t *ref, const pix_t *b, int rowbytes, point_t *mv,
+ const rectangle_t *range, int qp, point_t mv_pred, int min_sad, point_t wh, pix_t *scratch, pix_t **ppbest, int store_bytes)
+{
+ // cache map cache moves
+ // 3 0 x->1
+ // * 1 x->0
+ // 1 * x * 0 2 x->3
+ // * 3 x->2
+ // 2 ^1
+
+ // cache double moves:
+ // prev prev
+ // x -> 0 -> 3 ==> 3 => 1
+ // x -> 0 -> 2 ==> 2 => 1
+ // x -> 0 -> 0 ==> 0 => 1
+ // x -> 0 -> 1 - impossible
+ // prev SAD(n) is (n+4)
+ //
+
+ static const point_t dir2mv[] = {{{4, 0}},{{-4, 0}},{{0, 4}},{{0, -4}}};
+ union
+ {
+ uint16_t cache[8];
+ uint32_t cache32[4];
+ } sad;
+
+ int dir, cloop, dir_prev, cost;
+ point_t v;
+
+ assert(mv_in_rect(*mv, range));
+
+restart:
+ dir = 0; // start gradient descend with direction dir2mv[0]
+ cloop = 4; // try 4 directions
+ dir_prev = -1; // not yet moved
+
+ // reset SAD cache
+ sad.cache32[0] = sad.cache32[1] = sad.cache32[2] = sad.cache32[3] = ~0u;
+
+ // 1. Full-pel ME with small diamond modification:
+ // center point moved immediately as soon as new minimum found
+ do
+ {
+ assert(dir >= 0 && dir < 4);
+
+ // Try next point. Avoid out-of-range moves
+ v = mv_add(*mv, dir2mv[dir]);
+ //if (mv_in_rect(v, range) && sad.cache[dir] == (uint16_t)~0u)
+ if (mv_in_rect(v, range) && sad.cache[dir] == 0xffffu)
+ {
+ cost = h264e_sad_mb_unlaign_wh(ref + ((v.s.y*rowbytes + v.s.x) >> 2), rowbytes, b, wh);
+ //cost += me_mv_cost(*mv, mv_pred, qp);
+ cost += me_mv_cost(v, mv_pred, qp);
+ sad.cache[dir] = (uint16_t)cost;
+ if (cost < min_sad)
+ {
+ // This point is better than center: move this point to center and continue
+ int corner = ~0;
+ if (dir_prev >= 0) // have previous move
+ { // save cache point, which can be used in next iteration
+ corner = sad.cache[4 + dir]; // see "cache double moves" above
+ }
+ sad.cache32[2] = sad.cache32[0]; // save current cache to 'previous'
+ sad.cache32[3] = sad.cache32[1];
+ sad.cache32[0] = sad.cache32[1] = ~0u; // reset current cache
+ if (dir_prev >= 0) // but if have previous move
+ { // one cache point can be reused from previous iteration
+ sad.cache[dir_prev^1] = (uint16_t)corner; // see "cache double moves" above
+ }
+ sad.cache[dir^1] = (uint16_t)min_sad; // previous center becomes a neighbor
+ dir_prev = dir; // save this direction
+ dir--; // start next iteration with the same direction
+ cloop = 4 + 1; // and try 4 directions (+1 for do-while loop)
+ *mv = v; // Save best point found
+ min_sad = cost; // and its SAD
+ }
+ }
+ dir = (dir + 1) & 3; // cycle search directions
+ } while(--cloop);
+
+ // 2. Optional: Try diagonal step
+ //if (1)
+ {
+ int primary_dir = sad.cache[3] >= sad.cache[2] ? 2 : 3;
+ int secondary_dir = sad.cache[1] >= sad.cache[0] ? 0 : 1;
+ if (sad.cache[primary_dir] < sad.cache[secondary_dir])
+ {
+ SWAP(int, secondary_dir, primary_dir);
+ }
+
+ v = mv_add(dir2mv[secondary_dir], dir2mv[primary_dir]);
+ v = mv_add(*mv, v);
+ //cost = (uint16_t)~0u;
+ if (mv_in_rect(v, range))
+ {
+ cost = h264e_sad_mb_unlaign_wh(ref + ((v.s.y*rowbytes + v.s.x) >> 2), rowbytes, b, wh);
+ cost += me_mv_cost(v, mv_pred, qp);
+ if (cost < min_sad)
+ {
+ *mv = v;//mv_add(*mv, v);
+ min_sad = cost;
+ goto restart;
+ }
+ }
+ }
+
+ interpolate_luma(ref, rowbytes, *mv, wh, scratch); // Plain NxM copy can be used
+ *ppbest = scratch;
+
+ // 3. Fractional pel search
+ if (enc->run_param.encode_speed < 9 && mv_in_rect(*mv, &enc->frame.mv_qpel_limit))
+ {
+ point_t vbest = *mv;
+ pix_t *pbest = scratch;
+ pix_t *hpel = scratch + store_bytes;
+ pix_t *hpel1 = scratch + ((store_bytes == 8) ? 256 : 2*store_bytes);
+ pix_t *hpel2 = hpel1 + store_bytes;
+
+ int i, sad_test;
+ point_t primary_qpel, secondary_qpel, vdiag;
+
+ unsigned minsad1 = sad.cache[1];
+ unsigned minsad2 = sad.cache[3];
+ secondary_qpel = point(-1, 0);
+ primary_qpel = point(0, -1);
+ if (sad.cache[3] >= sad.cache[2])
+ primary_qpel = point(0, 1), minsad2 = sad.cache[2];
+ if (sad.cache[1] >= sad.cache[0])
+ secondary_qpel = point(1, 0), minsad1 = sad.cache[0];
+
+ if (minsad2 > minsad1)
+ {
+ SWAP(point_t, secondary_qpel, primary_qpel);
+ }
+
+ // ============> primary
+ // |00 01 02
+ // |10 11 12
+ // |20 22
+ // V
+ // secondary
+ vdiag = mv_add(primary_qpel, secondary_qpel);
+
+ for (i = 0; i < 7; i++)
+ {
+ pix_t *ptest;
+ switch(i)
+ {
+ case 0:
+ // 02 = interpolate primary half-pel
+ v = mv_add(*mv, mv_add(primary_qpel, primary_qpel));
+ interpolate_luma(ref, rowbytes, v, wh, ptest = hpel1);
+ break;
+ case 1:
+ // 01 q-pel = (00 + 02)/2
+ v = mv_add(*mv, primary_qpel);
+ h264e_qpel_average_wh_align(scratch, hpel1, ptest = hpel, wh);
+ break;
+ case 2:
+ // 20 = interpolate secondary half-pel
+ v = mv_add(*mv, mv_add(secondary_qpel, secondary_qpel));
+ interpolate_luma(ref, rowbytes, v, wh, ptest = hpel2);
+ break;
+ case 3:
+ // 10 q-pel = (00 + 20)/2
+ hpel = scratch + store_bytes; if (pbest == hpel) hpel = scratch;
+ v = mv_add(*mv, secondary_qpel);
+ h264e_qpel_average_wh_align(scratch, hpel2, ptest = hpel, wh);
+ break;
+ case 4:
+ // 11 q-pel = (02 + 20)/2
+ hpel = scratch + store_bytes; if (pbest == hpel) hpel = scratch;
+ v = mv_add(*mv, vdiag);
+ h264e_qpel_average_wh_align(hpel1, hpel2, ptest = hpel, wh);
+ break;
+ case 5:
+ // 22 = interpolate center half-pel
+ if (pbest == hpel2) hpel2 = scratch, hpel = scratch + store_bytes;
+ v = mv_add(*mv, mv_add(vdiag, vdiag));
+ interpolate_luma(ref, rowbytes, v, wh, ptest = hpel2);
+ break;
+ case 6:
+ default:
+ // 12 q-pel = (02 + 22)/2
+ hpel = scratch + store_bytes; if (pbest == hpel) hpel = scratch;
+ v = mv_add(*mv, mv_add(primary_qpel, vdiag));
+ h264e_qpel_average_wh_align(hpel2, hpel1, ptest = hpel, wh);
+ break;
+ }
+
+ sad_test = h264e_sad_mb_unlaign_wh(ptest, 16, b, wh) + me_mv_cost(v, mv_pred, qp);
+ if (sad_test < min_sad)
+ {
+ min_sad = sad_test;
+ vbest = v;
+ pbest = ptest;
+ }
+ }
+
+ *mv = vbest;
+ *ppbest = pbest;
+ }
+ return min_sad;
+}
+
+/**
+* Set range for MV search
+*/
+static void me_mv_set_range(point_t *pnt, rectangle_t *range, const rectangle_t *mv_limit, int mby)
+{
+ // clip start point
+ rectangle_t r = *mv_limit;
+ r.tl.s.y = (int16_t)(MAX(r.tl.s.y, mby - 63*4));
+ r.br.s.y = (int16_t)(MIN(r.br.s.y, mby + 63*4));
+ mv_clip(pnt, &r);
+ range->tl = mv_add(*pnt, point(-MV_RANGE*4, -MV_RANGE*4));
+ range->br = mv_add(*pnt, point(+MV_RANGE*4, +MV_RANGE*4));
+ // clip search range
+ mv_clip(&range->tl, &r);
+ mv_clip(&range->br, &r);
+}
+
+/**
+* Remove duplicates from MV candidates list
+*/
+static int me_mv_refine_cand(point_t *p, int n)
+{
+ int i, j, k;
+ p[0] = mv_round_qpel(p[0]);
+ for (j = 1, k = 1; j < n; j++)
+ {
+ point_t mv = mv_round_qpel(p[j]);
+ for (i = 0; i < k; i++)
+ {
+ // TODO
+ //if (!mv_differs3(mv, p[i], 3*4))
+ //if (!mv_differs3(mv, p[i], 1*4))
+ //if (!mv_differs3(mv, p[i], 3))
+ if (mv_equal(mv, p[i]))
+ break;
+ }
+ if (i == k)
+ p[k++] = mv;
+ }
+ return k;
+}
+
+/**
+* Choose candidates for inter MB partitioning (16x8,8x16 or 8x8),
+* using SAD's for 8x8 sub-blocks
+*/
+static void mb_inter_partition(/*const */int sad[4], int mode[4])
+{
+/*
+ slope
+ |[ 1 1]| _ |[ 1 -1]|
+ |[-1 -1]| |[ 1 -1]|
+ indicates v/h gradient: big negative = vertical prediction; big positive = horizontal
+
+ skew
+ |[ 1 0]| _ |[ 0 -1]|
+ |[ 0 -1]| |[ 1 0]|
+ indicates diagonal gradient: big negative = diagonal down right
+*/
+ int p00 = sad[0];
+ int p01 = sad[1];
+ int p10 = sad[2];
+ int p11 = sad[3];
+ int sum = p00 + p01 + p10 + p11;
+ int slope = ABS((p00 - p10) + (p01 - p11)) - ABS((p00 - p01) + (p10 - p11));
+ int skew = ABS(p11 - p00) - ABS(p10 - p01);
+
+ if (slope > (sum >> 4))
+ {
+ mode[1] = 1; // try 8x16 partition
+ }
+ if (slope < -(sum >> 4))
+ {
+ mode[2] = 1; // try 16x8 partition
+ }
+ if (ABS(skew) > (sum >> 4) && ABS(slope) <= (sum >> 4))
+ {
+ mode[3] = 1; // try 8x8 partition
+ }
+}
+
+/**
+* Online MV clustering to "long" and "short" clusters
+* Estimate mean "long" and "short" vectors
+*/
+static void mv_clusters_update(h264e_enc_t *enc, point_t mv)
+{
+ int mv_norm = SQRP(mv);
+ int n0 = SQRP(enc->mv_clusters[0]);
+ int n1 = SQRP(enc->mv_clusters[1]);
+ if (mv_norm < n1)
+ {
+ // "short" is shorter than "long"
+ SMOOTH(enc->mv_clusters[0], mv);
+ }
+ if (mv_norm >= n0)
+ {
+ // "long" is longer than "short"
+ SMOOTH(enc->mv_clusters[1], mv);
+ }
+}
+
+/**
+* Choose inter mode: skip/coded, ME partition, find MV
+*/
+static void inter_choose_mode(h264e_enc_t *enc)
+{
+ int prefered_modes[4] = { 1, 0, 0, 0 };
+ point_t mv_skip, mv_skip_a, mv_cand[MAX_MV_CAND];
+ point_t mv_pred_16x16 = me_mv_medianpredictor_get_skip(enc);
+ point_t mv_best = point(MV_NA, 0); // avoid warning
+
+ int sad, sad_skip = 0x7FFFFFFF, sad_best = 0x7FFFFFFF;
+ int off, i, j = 0, ncand = 0;
+ int cand_sad4[MAX_MV_CAND][4];
+ const pix_t *ref_yuv = enc->ref.yuv[0];
+ int ref_stride = enc->ref.stride[0];
+ int mv_cand_cost_best = 0;
+ mv_skip = enc->mb.mv_skip_pred;
+ mv_skip_a = mb_abs_mv(enc, mv_skip);
+
+ for (i = 0; i < 4; i++)
+ {
+ enc->df.df_mv[4 + 5*i].u32 = enc->mv_pred[i].u32;
+ enc->df.df_mv[i].u32 = enc->mv_pred[8 + 4*enc->mb.x + i].u32;
+ }
+
+ // Try skip mode
+ if (mv_in_rect(mv_skip_a, &enc->frame.mv_qpel_limit))
+ {
+ int *sad4 = cand_sad4[0];
+ interpolate_luma(ref_yuv, ref_stride, mv_skip_a, point(16, 16), enc->ptest);
+ sad_skip = h264e_sad_mb_unlaign_8x8(enc->scratch->mb_pix_inp, 16, enc->ptest, sad4);
+
+ if (MAX(MAX(sad4[0], sad4[1]), MAX(sad4[2], sad4[3])) < g_skip_thr_inter[enc->rc.qp])
+ {
+ int uv, sad_uv;
+
+ SWAP(pix_t*, enc->pbest, enc->ptest);
+ enc->mb.type = -1;
+ enc->mb.mv[0] = mv_skip;
+ enc->mb.cost = 0;
+ interpolate_chroma(enc, mv_skip_a);
+
+ // Check that chroma SAD is not too big for the skip
+ for (uv = 1; uv <= 2; uv++)
+ {
+ pix_t *pred = enc->ptest + (uv - 1)*8;
+ pix_t *pix_mb_uv = mb_input_chroma(enc, uv);
+ int inp_stride = enc->inp.stride[uv];
+
+ if (enc->frame.cropping_flag && ((enc->mb.x + 1)*16 > enc->param.width || (enc->mb.y + 1)*16 > enc->param.height))
+ {
+ // Speculative read beyond frame borders: make local copy of the macroblock.
+ // TODO: same code used in mb_write() and mb_encode()
+ pix_copy_cropped_mb(enc->scratch->mb_pix_store, 8, pix_mb_uv, enc->inp.stride[uv],
+ MIN(8, enc->param.width/2 - enc->mb.x*8),
+ MIN(8, enc->param.height/2 - enc->mb.y*8));
+ pix_mb_uv = enc->scratch->mb_pix_store;
+ inp_stride = 8;
+ }
+
+ sad_uv = h264e_sad_mb_unlaign_wh(pix_mb_uv, inp_stride, pred, point(8, 8));
+ if (sad_uv >= g_skip_thr_inter[enc->rc.qp])
+ {
+ break;
+ }
+ }
+ if (uv == 3)
+ {
+ return;
+ }
+ }
+
+ if (enc->run_param.encode_speed < 1) // enable 8x16, 16x8 and 8x8 partitions
+ {
+ mb_inter_partition(sad4, prefered_modes);
+ }
+
+ //sad_skip += me_mv_cost(mv_skip, mv_pred_16x16, enc->rc.qp);
+
+ // Skip SAD is too big: use the skip predictor as a diamond search start point candidate
+ mv_best = mv_round_qpel(mv_skip);
+ mv_cand[ncand++] = mv_best;
+ if (!((mv_skip.s.x | mv_skip.s.y) & 3))
+ {
+ sad_best = sad_skip;//+ me_mv_cost(mv_best, mv_pred_16x16, enc->rc.qp)
+ mv_cand_cost_best = me_mv_cand_cost(mv_skip, mv_pred_16x16, enc->rc.qp);
+ //mv_cand_cost_best = me_mv_cand_cost(mv_skip, point(0,0), enc->rc.qp);
+ j = 1;
+ }
+ }
+
+ mv_cand[ncand++] = mv_pred_16x16;
+ ncand += me_mv_medianpredictor_get_cand(enc, mv_cand + ncand);
+
+ if (enc->mb.x <= 0)
+ {
+ mv_cand[ncand++] = point(8*4, 0);
+ }
+ if (enc->mb.y <= 0)
+ {
+ mv_cand[ncand++] = point(0, 8*4);
+ }
+
+ mv_cand[ncand++] = enc->mv_clusters[0];
+ mv_cand[ncand++] = enc->mv_clusters[1];
+
+ assert(ncand <= MAX_MV_CAND);
+ ncand = me_mv_refine_cand(mv_cand, ncand);
+
+ for (/*j = 0*/; j < ncand; j++)
+ {
+ point_t mv = mb_abs_mv(enc, mv_cand[j]);
+ if (mv_in_rect(mv, &enc->frame.mv_limit))
+ {
+ int mv_cand_cost = me_mv_cand_cost(mv_cand[j], mv_pred_16x16, enc->rc.qp);
+
+ int *sad4 = cand_sad4[j];
+ off = ((mv.s.y + 0) >> 2)*ref_stride + ((mv.s.x + 0) >> 2);
+ sad = h264e_sad_mb_unlaign_8x8(ref_yuv + off, ref_stride, enc->scratch->mb_pix_inp, sad4);
+
+ if (enc->run_param.encode_speed < 1) // enable 8x16, 16x8 and 8x8 partitions
+ {
+ mb_inter_partition(sad4, prefered_modes);
+ }
+
+ if (sad + mv_cand_cost < sad_best + mv_cand_cost_best)
+ //if (sad < sad_best)
+ {
+ mv_cand_cost_best = mv_cand_cost;
+ sad_best = sad;
+ mv_best = mv_cand[j];
+ }
+ }
+ }
+
+ sad_best += me_mv_cost(mv_best, mv_pred_16x16, enc->rc.qp);
+
+ {
+ int mb_type;
+ point_t wh, part, mvpred_ctx[12], part_mv[4][16], part_mvd[4][16];
+ pix_t *store = enc->scratch->mb_pix_store;
+ pix_t *pred_best = store, *pred_test = store + 256;
+
+#define MAX8X8_MODES 4
+ me_mv_medianpredictor_save_ctx(enc, mvpred_ctx);
+ enc->mb.cost = 0xffffff;
+ for (mb_type = 0; mb_type < MAX8X8_MODES; mb_type++)
+ {
+ static const int nbits[4] = { 1, 4, 4, 12 };
+ int imv = 0;
+ int part_sad = MUL_LAMBDA(nbits[mb_type], g_lambda_q4[enc->rc.qp]);
+
+ if (!prefered_modes[mb_type]) continue;
+
+ wh.s.x = (mb_type & 2) ? 8 : 16;
+ wh.s.y = (mb_type & 1) ? 8 : 16;
+ part = point(0, 0);
+ for (;;)
+ {
+ rectangle_t range;
+ pix_t *diamond_out;
+ point_t mv, mv_pred, mvabs = mb_abs_mv(enc, mv_best);
+ me_mv_set_range(&mvabs, &range, &enc->frame.mv_limit, enc->mb.y*16*4 + part.s.y*4);
+
+ mv_pred = me_mv_medianpredictor_get(enc, part, wh);
+
+ if (mb_type)
+ {
+ mvabs = mv_round_qpel(mb_abs_mv(enc, mv_pred));
+ me_mv_set_range(&mvabs, &range, &enc->frame.mv_limit, enc->mb.y*16*4 + part.s.y*4);
+ off = ((mvabs.s.y >> 2) + part.s.y)*ref_stride + ((mvabs.s.x >> 2) + part.s.x);
+ sad_best = h264e_sad_mb_unlaign_wh(ref_yuv + off, ref_stride, enc->scratch->mb_pix_inp + part.s.y*16 + part.s.x, wh)
+ + me_mv_cost(mvabs,
+ //mv_pred,
+ mb_abs_mv(enc, mv_pred),
+ enc->rc.qp);
+ }
+
+ part_sad += me_search_diamond(enc, ref_yuv + part.s.y*ref_stride + part.s.x,
+ enc->scratch->mb_pix_inp + part.s.y*16 + part.s.x, ref_stride, &mvabs, &range, enc->rc.qp,
+ mb_abs_mv(enc, mv_pred), sad_best, wh,
+ store, &diamond_out, mb_type ? (mb_type == 2 ? 8 : 128) : 256);
+
+ if (!mb_type)
+ {
+ pred_test = diamond_out;
+ if (pred_test < store + 2*256)
+ {
+ pred_best = (pred_test == store ? store + 256 : store);
+ store += 2*256;
+ } else
+ {
+ pred_best = (pred_test == (store + 512) ? store + 512 + 256 : store + 512);
+ }
+ } else
+ {
+ h264e_copy_8x8(pred_test + part.s.y*16 + part.s.x, 16, diamond_out);
+ if (mb_type < 3)
+ {
+ int part_off = (wh.s.x >> 4)*8 + (wh.s.y >> 4)*8*16;
+ h264e_copy_8x8(pred_test + part_off + part.s.y*16 + part.s.x, 16, diamond_out + part_off);
+ }
+ }
+
+ mv = mv_sub(mvabs, point(enc->mb.x*16*4, enc->mb.y*16*4));
+
+ part_mvd[mb_type][imv] = mv_sub(mv, mv_pred);
+ part_mv[mb_type][imv++] = mv;
+
+ me_mv_medianpredictor_put(enc, part.s.x >> 2, part.s.y >> 2, wh.s.x >> 2, wh.s.y >> 2, mv);
+
+ part.s.x = (part.s.x + wh.s.x) & 15;
+ if (!part.s.x)
+ {
+ part.s.y = (part.s.y + wh.s.y) & 15;
+ if (!part.s.y) break;
+ }
+ }
+
+ me_mv_medianpredictor_restore_ctx(enc, mvpred_ctx);
+
+ if (part_sad < enc->mb.cost)
+ {
+ SWAP(pix_t*, pred_best, pred_test);
+ enc->mb.cost = part_sad;
+ enc->mb.type = mb_type;
+ }
+ }
+ enc->pbest = pred_best;
+ enc->ptest = pred_test;
+ memcpy(enc->mb.mv, part_mv [enc->mb.type], 16*sizeof(point_t));
+ memcpy(enc->mb.mvd, part_mvd[enc->mb.type], 16*sizeof(point_t));
+
+ if (enc->mb.cost > sad_skip)
+ {
+ enc->mb.type = 0;
+ enc->mb.cost = sad_skip + me_mv_cand_cost(mv_skip, mv_pred_16x16, enc->rc.qp);
+ enc->mb.mv [0] = mv_skip;
+ enc->mb.mvd[0] = mv_sub(mv_skip, mv_pred_16x16);
+
+ assert(mv_in_rect(mv_skip_a, &enc->frame.mv_qpel_limit)) ;
+ interpolate_luma(ref_yuv, ref_stride, mv_skip_a, point(16, 16), enc->pbest);
+ interpolate_chroma(enc, mv_skip_a);
+ }
+ }
+}
+
+/************************************************************************/
+/* Deblock filter */
+/************************************************************************/
+#define MB_FLAG_SVC_INTRA 1
+#define MB_FLAG_SLICE_START_DEBLOCK_2 2
+
+/**
+* Set deblock filter strength
+*/
+static void df_strength(deblock_filter_t *df, int mb_type, int mbx, uint8_t *strength, int IntraBLFlag)
+{
+ uint8_t *sv = strength;
+ uint8_t *sh = strength + 16;
+ int flag = df->nzflag;
+ df->df_nzflag[mbx] = (uint8_t)(flag >> 20);
+ /*
+ nzflag represents the macroblock and its neighbors with 24 bit flags:
+ 0 1 2 3
+ 4 5 6 7 8
+ A B C D E
+ F G H I J
+ K L M N O
+ */
+ (void)IntraBLFlag;
+#if H264E_SVC_API
+ if (IntraBLFlag & MB_FLAG_SVC_INTRA)
+ {
+ int ccloop = 4;
+ do
+ {
+ int cloop = 4;
+ do
+ {
+ int v = 0;
+ if (flag & 3 << 4)
+ {
+ v = 1;
+ }
+
+ *sv = (uint8_t)v; sv += 4;
+
+ v = 0;
+ if (flag & 33)
+ {
+ v = 1;
+ }
+
+ *sh++ = (uint8_t)v;
+
+ flag >>= 1;
+
+ } while(--cloop);
+ flag >>= 1;
+ sv -= 15;
+
+ } while(--ccloop);
+ } else
+#endif
+ {
+ if (mb_type < 5)
+ {
+ int ccloop = 4;
+ point_t *mv = df->df_mv;
+ do
+ {
+ int cloop = 4;
+ do
+ {
+ int v = 0;
+ if (flag & 3 << 4)
+ {
+ v = 2;
+ } else if (mv_differs3(mv[4], mv[5]))
+ {
+ v = 1;
+ }
+ *sv = (uint8_t)v; sv += 4;
+
+ v = 0;
+ if (flag & 33)
+ {
+ v = 2;
+ } else if (mv_differs3(mv[0], mv[5]))
+ {
+ v = 1;
+ }
+ *sh++ = (uint8_t)v;
+
+ flag >>= 1;
+ mv++;
+ } while(--cloop);
+ flag >>= 1;
+ sv -= 15;
+ mv++;
+ } while(--ccloop);
+ } else
+ {
+ // Deblock mode #3 (intra)
+ ((uint32_t*)(sv))[1] = ((uint32_t*)(sv))[2] = ((uint32_t*)(sv))[3] = // for inner columns
+ ((uint32_t*)(sh))[1] = ((uint32_t*)(sh))[2] = ((uint32_t*)(sh))[3] = 0x03030303; // for inner rows
+ }
+ if ((mb_type >= 5 || df->mb_type[mbx - 1] >= 5)) // speculative read
+ {
+ ((uint32_t*)(strength))[0] = 0x04040404; // Deblock mode #4 (strong intra) for left column
+ }
+ if ((mb_type >= 5 || df->mb_type[mbx ] >= 5))
+ {
+ ((uint32_t*)(strength))[4] = 0x04040404; // Deblock mode #4 (strong intra) for top row
+ }
+ }
+ df->mb_type[mbx] = (int8_t)mb_type;
+}
+
+/**
+* Run deblock for current macroblock
+*/
+static void mb_deblock(deblock_filter_t *df, int mb_type, int qp_this, int mbx, int mby, H264E_io_yuv_t *mbyuv, int IntraBLFlag)
+{
+ int i, cr, qp, qp_left, qp_top;
+ deblock_params_t par;
+ uint8_t *alpha = par.alpha; //[2*2];
+ uint8_t *beta = par.beta; //[2*2];
+ uint32_t *strength32 = par.strength32; //[4*2]; // == uint8_t strength[16*2];
+ uint8_t *strength = (uint8_t *)strength32;
+ uint8_t *tc0 = par.tc0; //[16*2];
+
+ df_strength(df, mb_type, mbx, strength, IntraBLFlag);
+ if (!mbx || (IntraBLFlag & MB_FLAG_SLICE_START_DEBLOCK_2))
+ {
+ strength32[0] = 0;
+ }
+
+ if (!mby)
+ {
+ strength32[4] = 0;
+ }
+
+ qp_top = df->df_qp[mbx];
+ qp_left = df->df_qp[mbx - 1];
+ df->df_qp[mbx] = (uint8_t)qp_this;
+
+ cr = 0;
+ for (;;)
+ {
+ const uint8_t *lut;
+ if (*((uint32_t*)strength))
+ {
+ qp = (qp_left + qp_this + 1) >> 1;
+ lut = g_a_tc0_b[-10 + qp + ALPHA_OFS];
+ alpha[0] = lut[0];
+ beta[0] = lut[4 + (BETA_OFS - ALPHA_OFS)*5];
+ for (i = 0; i < 4; i++) tc0[i] = lut[strength[i]];
+ }
+ if (*((uint32_t*)(strength + 16)))
+ {
+ qp = (qp_top + qp_this + 1) >> 1;
+ lut = g_a_tc0_b[-10 + qp + ALPHA_OFS];
+
+ alpha[2] = lut[0];
+ beta[2] = lut[4 + (BETA_OFS - ALPHA_OFS)*5];
+ for (i = 0; i < 4; i++) tc0[16 + i] = lut[strength[16 + i]];
+ }
+
+ lut = g_a_tc0_b[-10 + qp_this + ALPHA_OFS];
+ alpha[3] = alpha[1] = lut[0];
+ beta[3] = beta[1] = lut[4 + (BETA_OFS - ALPHA_OFS)*5];
+ for (i = 4; i < 16; i++)
+ {
+ tc0[i] = lut[strength[i]];
+ tc0[16 + i] = lut[strength[16 + i]];
+ }
+ if (cr)
+ {
+ int *t = (int *)tc0;
+ t[1] = t[2]; // TODO: need only for OMX
+ t[5] = t[6];
+ i = 2;
+ do
+ {
+ h264e_deblock_chroma(mbyuv->yuv[i], mbyuv->stride[i], &par);
+ } while (--i);
+ break;
+ }
+ h264e_deblock_luma(mbyuv->yuv[0], mbyuv->stride[0], &par);
+
+ qp_this = qpy2qpc[qp_this + DQP_CHROMA];
+ qp_left = qpy2qpc[qp_left + DQP_CHROMA];
+ qp_top = qpy2qpc[qp_top + DQP_CHROMA];
+ cr++;
+ }
+}
+
+/************************************************************************/
+/* Macroblock encoding */
+/************************************************************************/
+/**
+* Macroblock encoding
+*/
+static void mb_encode(h264e_enc_t *enc, int enc_type)
+{
+ pix_t *top = enc->top_line + 48 + enc->mb.x*32;
+ pix_t *left = enc->top_line;
+ int avail = enc->mb.avail = mb_avail_flag(enc);
+ int base_mode = 0;
+
+ if (enc->frame.cropping_flag && ((enc->mb.x + 1)*16 > enc->param.width || (enc->mb.y + 1)*16 > enc->param.height))
+ {
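+ // frame size is not a multiple of 16: make a local copy of the partially covered macroblock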
+ pix_copy_cropped_mb(enc->scratch->mb_pix_inp, 16, mb_input_luma(enc), enc->inp.stride[0],
+ MIN(16, enc->param.width - enc->mb.x*16),
+ MIN(16, enc->param.height - enc->mb.y*16));
+ } else
+ {
+ // cache input macroblock
+ h264e_copy_16x16(enc->scratch->mb_pix_inp, 16, mb_input_luma(enc), enc->inp.stride[0]);
+ }
+
+ if (!(avail & AVAIL_L)) left = NULL;
+ if (!(avail & AVAIL_T)) top = NULL;
+
+ enc->pbest = enc->scratch->mb_pix_store;
+ enc->ptest = enc->pbest + 256;
+ enc->mb.type = 0;
+ enc->mb.cost = 0x7FFFFFFF;
+
+ if (enc->slice.type == SLICE_TYPE_P)
+ {
+ inter_choose_mode(enc);
+ }
+#if H264E_SVC_API
+ else if (enc_type > 0 && enc->param.inter_layer_pred_flag)
+ {
+ base_mode = 1;
+ enc->mb.type = 6;
+ h264e_copy_16x16(enc->pbest, 16, (enc->ref.yuv[0] + (enc->mb.x + enc->mb.y*enc->ref.stride[0])*16), enc->ref.stride[0]);
+ h264e_copy_8x8_s(enc->ptest, 16, (enc->ref.yuv[1] + (enc->mb.x + enc->mb.y*enc->ref.stride[1])*8), enc->ref.stride[1]);
+ h264e_copy_8x8_s(enc->ptest + 8, 16, (enc->ref.yuv[2] + (enc->mb.x + enc->mb.y*enc->ref.stride[2])*8), enc->ref.stride[2]);
+
+ goto _WRITE_MB;
+ }
+#endif
+
+ if (enc->mb.type >= 0)
+ {
+ intra_choose_16x16(enc, left, top, avail);
+ if (enc->run_param.encode_speed < 2 || enc->slice.type != SLICE_TYPE_P) // enable intra4x4 on P slices
+ {
+ intra_choose_4x4(enc);
+ }
+ }
+
+ if (enc->mb.type < 5)
+ {
+ mv_clusters_update(enc, enc->mb.mv[0]);
+ }
+
+ if (enc->mb.type >= 5)
+ {
+ pix_t *pred = enc->ptest;
+ h264e_intra_predict_chroma(pred, left + 16, top + 16, enc->mb.i16.pred_mode_luma);
+ } else
+ {
+ interpolate_chroma(enc, mb_abs_mv(enc, enc->mb.mv[0]));
+ }
+
+#if H264E_SVC_API
+_WRITE_MB:
+#endif
+ mb_write(enc, enc_type, base_mode);
+
+ if (!enc->speed.disable_deblock)
+ {
+ int mbx = enc->mb.x;
+ int mby = enc->mb.y;
+#if H264E_MAX_THREADS
+ if (enc->param.max_threads > 1)
+ { // Avoid deblock across slice border
+ if (enc->mb.num < enc->slice.start_mb_num + enc->frame.nmbx)
+ mby = 0;
+ if (enc->mb.num == enc->slice.start_mb_num)
+ {
+ base_mode |= MB_FLAG_SLICE_START_DEBLOCK_2;
+ }
+ }
+#endif
+ mb_deblock(&enc->df, enc->mb.type, enc->rc.prev_qp, mbx, mby, &enc->dec, base_mode);
+ }
+}
+
+
+/************************************************************************/
+/* Rate-control */
+/************************************************************************/
+
+/**
+* @return zero threshold for given rounding offset
+*/
+static uint16_t rc_rnd2thr(int round, int q)
+{
+ int b, thr = 0;
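+ // bitwise binary search for the largest 16-bit thr such that thr*q <= 0x10000 - round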
+ for (b = 0x8000; b; b >>= 1)
+ {
+ int t = (thr | b)*q;
+ if (t <= 0x10000 - round) // TODO: error: < !!!!!!!
+ {
+ thr |= b;
+ }
+ }
+ return (uint16_t)thr;
+}
+
+/**
+* Set quantizer constants (deadzone and rounding) for given QP
+*/
+static void rc_set_qp(h264e_enc_t *enc, int qp)
+{
+ qp = MIN(qp, enc->run_param.qp_max);
+ qp = MAX(qp, enc->run_param.qp_min);
+ qp = MIN(qp, 51); // avoid VC2010 static analyzer warning
+
+ if (enc->rc.qp != qp)
+ {
+ static const int16_t g_quant_coeff[6*6] =
+ {
+ // 0 2 1
+ 13107, 10, 8066, 13, 5243, 16,
+ 11916, 11, 7490, 14, 4660, 18,
+ 10082, 13, 6554, 16, 4194, 20,
+ 9362, 14, 5825, 18, 3647, 23,
+ 8192, 16, 5243, 20, 3355, 25,
+ 7282, 18, 4559, 23, 2893, 29
+ // 0 2 0 2
+ // 2 1 2 1
+ // 0 2 0 2
+ // 2 1 2 1
+ };
+
+ int cloop = 2;
+ enc->rc.qp = qp;
+
+ do
+ {
+ uint16_t *qdat0 = enc->rc.qdat[2 - cloop];
+ uint16_t *qdat = enc->rc.qdat[2 - cloop];
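+ // qp*86 >> 9 equals qp/6 for qp in [0..51] (86/512 approximates 1/6)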
+ int qp_div6 = qp*86 >> 9;
+ int qp_mod6 = qp - qp_div6*6;
+ const int16_t *quant_coeff = g_quant_coeff + qp_mod6*6; // TODO: need calculate qp%6*6
+ int i = 3;
+
+ // Quant/dequant multiplier
+ do
+ {
+ *qdat++ = *quant_coeff++ << 1 >> qp_div6;
+ *qdat++ = *quant_coeff++ << qp_div6;
+ } while(--i);
+
+ // quantizer deadzone for P & chroma
+ *qdat++ = enc->slice.type == SLICE_TYPE_P ? g_rnd_inter[qp] : g_deadzonei[qp];
+ // quantizer deadzone for I
+ *qdat++ = g_deadzonei[qp];
+
+ *qdat++ = g_thr_inter[qp] - 0x7fff;
+ *qdat++ = g_thr_inter2[qp] - 0x7fff;
+
+ qdat[0] = qdat[2] = rc_rnd2thr(g_thr_inter[qp] - 0x7fff, qdat0[0]);
+ qdat[1] = qdat[3] =
+ qdat[4] = qdat[6] = rc_rnd2thr(g_thr_inter[qp] - 0x7fff, qdat0[2]);
+ qdat[5] = qdat[7] = rc_rnd2thr(g_thr_inter[qp] - 0x7fff, qdat0[4]);
+ qdat += 8;
+ qdat[0] = qdat[2] = rc_rnd2thr(g_thr_inter2[qp] - 0x7fff, qdat0[0]);
+ qdat[1] = qdat[3] =
+ qdat[4] = qdat[6] = rc_rnd2thr(g_thr_inter2[qp] - 0x7fff, qdat0[2]);
+ qdat[5] = qdat[7] = rc_rnd2thr(g_thr_inter2[qp] - 0x7fff, qdat0[4]);
+ qdat += 8;
+ qdat[0] = qdat[2] = qdat0[0];
+ qdat[1] = qdat[3] =
+ qdat[4] = qdat[6] = qdat0[2];
+ qdat[5] = qdat[7] = qdat0[4];
+ qdat += 8;
+ qdat[0] = qdat[2] = qdat0[1];
+ qdat[1] = qdat[3] =
+ qdat[4] = qdat[6] = qdat0[3];
+ qdat[5] = qdat[7] = qdat0[5];
+
+ qp = qpy2qpc[qp + DQP_CHROMA];
+ } while (--cloop);
+ }
+}
+
+/**
+* Estimate frame bit budget and QP
+*
+* How is the bit budget allocated?
+* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* 1. Estimate target size of I and P macroblock, assuming same quality
+* 2. Estimate I peak size
+* 3. Estimate desired stationary VBV level
+*
+*/
+static int rc_frame_start(h264e_enc_t *enc, int is_intra, int is_refers_to_long_term)
+{
+ unsigned np = MIN(enc->param.gop - 1u, 63u);
+ int nmb = enc->frame.nmb;
+
+ int qp = -1, add_bits, bit_budget = enc->run_param.desired_frame_bytes*8;
+ int nominal_p, gop_bits, stationary_vbv_level;
+ uint32_t peak_factor_q16;
+
+ // Estimate QP: find the lowest QP whose predicted GOP size fits the bit budget
+ do
+ {
+ qp++;
+ gop_bits = bits_per_mb[0][qp]*np + bits_per_mb[1][qp];
+ } while (gop_bits*nmb > (int)(np + 1)*enc->run_param.desired_frame_bytes*8 && qp < 40);
+
+ /*
+ * desired*gop = i + p*(gop-1); i/p = alpha;
+ * p = desired * gop / (gop-1+alpha) and i = p*alpha or i = (desired-p)*gop + p;
+ */
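+ // e.g. gop = 30, alpha = 4: p = desired*30/33 (about 0.91*desired), i = alpha*p (about 3.6*desired)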
+ peak_factor_q16 = div_q16(bits_per_mb[1][qp] << 16, bits_per_mb[0][qp] << 16);
+ if (np)
+ {
+ uint32_t ratio_q16 = div_q16((np + 1) << 16, (np << 16) + peak_factor_q16);
+ nominal_p = mul32x32shr16(enc->run_param.desired_frame_bytes*8, ratio_q16);
+ } else
+ {
+ nominal_p = 0;
+ }
+
+ stationary_vbv_level = MIN(enc->param.vbv_size_bytes*8 >> 4, enc->run_param.desired_frame_bytes*8);
+
+ if (is_intra)
+ {
+ int nominal_i = mul32x32shr16(nominal_p, peak_factor_q16);
+ add_bits = nominal_i - bit_budget;
+ }
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+ else if (is_refers_to_long_term)
+ {
+ int d_qp = enc->rc.max_dqp - enc->rc.dqp_smooth;
+ unsigned peak_factor_golden_q16;
+ int nominal_golden;
+ d_qp = MAX(d_qp, 2);
+ d_qp = MIN(d_qp, 12);
+ d_qp = d_qp * 4 * 85 >> 8;//* 16 / 12;
+
+ peak_factor_golden_q16 = (peak_factor_q16 - (1 << 16)) * d_qp >> 4;
+ nominal_golden = nominal_p + mul32x32shr16(nominal_p, peak_factor_golden_q16);
+ add_bits = nominal_golden - bit_budget;
+ }
+#endif
+ else
+ {
+ add_bits = nominal_p - bit_budget;
+
+ // drift to stationary level
+ if (enc->param.vbv_size_bytes)
+ {
+ add_bits += (enc->rc.vbv_target_level - enc->rc.vbv_bits) >> 4;
+ }
+ }
+ if (enc->param.vbv_size_bytes)
+ {
+ add_bits = MIN(add_bits, (enc->param.vbv_size_bytes*8*7 >> 3) - enc->rc.vbv_bits);
+ }
+
+ bit_budget += add_bits;
+ bit_budget = MIN(bit_budget, enc->run_param.desired_frame_bytes*8*16);
+ bit_budget = MAX(bit_budget, enc->run_param.desired_frame_bytes*8 >> 2);
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+ if (is_intra || is_refers_to_long_term)
+#else
+ if (is_intra)
+#endif
+ {
+ // Increase VBV target level due to I-frame load: this avoids QP adaptation after I-frame
+ enc->rc.vbv_target_level = enc->rc.vbv_bits + bit_budget - enc->run_param.desired_frame_bytes*8;
+ }
+
+ // Slow drift of VBV target to stationary level...
+ enc->rc.vbv_target_level -= enc->run_param.desired_frame_bytes*8 - nominal_p;
+
+ // ...until stationary level reached
+ enc->rc.vbv_target_level = MAX(enc->rc.vbv_target_level, stationary_vbv_level);
+
+ enc->rc.bit_budget = bit_budget;
+
+ if (enc->param.fine_rate_control_flag && enc->frame.num)
+ {
+ qp = enc->rc.qp_smooth >> 8;
+ } else
+ {
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+ if (is_refers_to_long_term)
+ {
+ for (qp = 0; qp < 42 - 1; qp++)
+ {
+ //if (((bits_per_mb[0][qp] + bits_per_mb[1][qp]) >> 1)*nmb < bit_budget)
+ if (((bits_per_mb[0][qp] + bits_per_mb[1][qp]) >> 1)*nmb < bit_budget)
+ break;
+ }
+ } else
+#endif
+ {
+ const uint16_t *bits = bits_per_mb[!!is_intra];
+ for (qp = 0; qp < 42 - 1; qp++)
+ {
+ if (bits[qp]*nmb < bit_budget)
+ {
+ break;
+ }
+ }
+ }
+ qp += MIN_QP;
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+ if (is_refers_to_long_term)
+ {
+ int dqp = MAX(enc->rc.max_dqp, enc->rc.dqp_smooth);
+ dqp = MIN(dqp, enc->rc.dqp_smooth + 6);
+ qp += dqp;
+ qp = MAX(enc->rc.prev_qp, qp);
+ } else
+#endif
+ {
+ qp += enc->rc.dqp_smooth;
+ }
+
+ // If the reference frame has a high QP, motion compensation is less effective, so QP should be increased
+ if (enc->rc.prev_qp > qp + 1)
+ {
+ qp = (enc->rc.prev_qp + qp + 1)/2;
+ }
+ }
+
+ enc->rc.qp = 0; // force
+ rc_set_qp(enc, qp);
+ qp = enc->rc.qp;
+
+ enc->rc.qp_smooth = qp << 8;
+ enc->rc.prev_qp = qp;
+
+ return (enc->rc.vbv_bits > enc->param.vbv_size_bytes*8);
+}
+
+/**
+* Update rate-control state after frame encode
+*/
+static void rc_frame_end(h264e_enc_t *enc, int intra_flag, int skip_flag, int is_refers_to_long_term)
+{
+ // 1. Update QP offset adaptive adjustment
+ if (!skip_flag /*&& !is_refers_to_long_term*/)
+ {
+ int qp, nmb = enc->frame.nmb;
+ // a posteriori QP estimation
+ for (qp = 0; qp != 41 && bits_per_mb[intra_flag][qp]*nmb > (int)enc->out_pos*8 - 32; qp++) {/*no action*/}
+
+ qp += MIN_QP;
+
+ if (!is_refers_to_long_term)
+ {
+ if ((enc->rc.qp_smooth >> 8) - enc->rc.dqp_smooth < qp - 1)
+ {
+ enc->rc.dqp_smooth--;
+ } else if ((enc->rc.qp_smooth >> 8) - enc->rc.dqp_smooth > qp + 1)
+ {
+ enc->rc.dqp_smooth++;
+ }
+ }
+ if (intra_flag || is_refers_to_long_term)
+ {
+ enc->rc.max_dqp = enc->rc.dqp_smooth;
+ } else
+ {
+ enc->rc.max_dqp = MAX(enc->rc.max_dqp, (enc->rc.qp_smooth >> 8) - qp);
+ }
+ }
+
+ // 2. Update VBV model state
+ enc->rc.vbv_bits += enc->out_pos*8 - enc->run_param.desired_frame_bytes*8;
+
+ // 3. If VBV model used, handle overflow/underflow
+ if (enc->param.vbv_size_bytes)
+ {
+ if (enc->rc.vbv_bits < 0) // VBV underflow
+ {
+ if (enc->param.vbv_underflow_stuffing_flag)
+ {
+ // put stuffing ('filler data')
+ nal_start(enc, 12); // filler_data_rbsp
+ do
+ {
+ U(8, 0xFF);
+ enc->rc.vbv_bits += 8;
+ } while (enc->rc.vbv_bits < 0);
+ nal_end(enc);
+ } else
+ {
+ // ignore underflow
+ enc->rc.vbv_bits = 0;
+ }
+ }
+ if (enc->rc.vbv_bits > enc->param.vbv_size_bytes*8) // VBV overflow
+ {
+ if (!enc->param.vbv_overflow_empty_frame_flag)
+ {
+ // ignore overflow
+ enc->rc.vbv_bits = enc->param.vbv_size_bytes*8;
+ }
+ }
+ } else
+ {
+ enc->rc.vbv_bits = 0;
+ }
+}
+
+/**
+* Update rate-control state after macroblock encode, set QP for next MB
+*/
+static void rc_mb_end(h264e_enc_t *enc)
+{
+ // bits used per coded MB vs. bit budget per total MBs (cross-multiplied to avoid division)
+ int bits_coded = h264e_bs_get_pos_bits(enc->bs) + enc->out_pos*8 + 1;
+ int mb_coded = enc->mb.num; // after increment: 1, 2....
+ int err = bits_coded *enc->frame.nmb - enc->rc.bit_budget*mb_coded;
+ int d_err = err - enc->rc.prev_err;
+ int qp = enc->rc.qp;
+ assert(enc->mb.num);
+ enc->rc.prev_err = err;
+
+ if (err > 0 && d_err > 0)
+ { // Increasing risk of overflow
+ if (enc->rc.stable_count < 3)
+ {
+ qp++; // State not stable: increase QP
+ }
+ enc->rc.stable_count = 0; // Set state to "not stable"
+ } else if (err < 0 && d_err < 0)
+ { // Increasing risk of underflow
+ if (enc->rc.stable_count < 3)
+ {
+ qp--;
+ }
+ enc->rc.stable_count = 0;
+ } else
+ { // Stable state
+ enc->rc.stable_count++;
+ }
+ enc->rc.qp_smooth += qp - (enc->rc.qp_smooth >> 8);
+ qp = MIN(qp, enc->rc.prev_qp + 3);
+ qp = MAX(qp, enc->rc.prev_qp - 3);
+ rc_set_qp(enc, qp);
+}
+
+/************************************************************************/
+/* Top-level API */
+/************************************************************************/
+
+#define ALIGN_128BIT(p) (void *)((uintptr_t)(((char*)(p)) + 15) & ~(uintptr_t)15)
+#define ALLOC(ptr, size) p = ALIGN_128BIT(p); if (enc) ptr = (void *)p; p += size;
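+// two-pass allocation: with enc == NULL, ALLOC only advances p to measure the required size;
+// otherwise it also assigns the 16-byte aligned pointer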
+
+/**
+* Internal allocator for persistent RAM
+*/
+static int enc_alloc(h264e_enc_t *enc, const H264E_create_param_t *par, unsigned char *p, int inp_buf_flag)
+{
+ unsigned char *p0 = p;
+ int nmbx = (par->width + 15) >> 4;
+ int nmby = (par->height + 15) >> 4;
+ int nref_frames = 1 + par->max_long_term_reference_frames + par->const_input_flag;
+#if H264E_ENABLE_DENOISE
+ nref_frames += !!par->temporal_denoise_flag;
+#endif
+ ALLOC(enc->ref.yuv[0], ((nmbx + 2) * (nmby + 2) * 384) * nref_frames);
+ (void)inp_buf_flag;
+#if H264E_SVC_API
+ if (inp_buf_flag)
+ {
+ ALLOC(enc->inp.yuv[0], ((nmbx)*(nmby)*384)); /* input buffer for base layer */
+ }
+#endif
+ return (int)((p - p0) + 15) & ~15u;
+}
+
+/**
+* Internal allocator for scratch RAM
+*/
+static int enc_alloc_scratch(h264e_enc_t *enc, const H264E_create_param_t *par, unsigned char *p)
+{
+ unsigned char *p0 = p;
+ int nmbx = (par->width + 15) >> 4;
+ int nmby = (par->height + 15) >> 4;
+ ALLOC(enc->scratch, sizeof(scratch_t));
+ ALLOC(enc->out, nmbx * nmby * (384 + 2 + 10) * 3/2);
+
+ ALLOC(enc->nnz, nmbx*8 + 8);
+ ALLOC(enc->mv_pred, (nmbx*4 + 8)*sizeof(point_t));
+ ALLOC(enc->i4x4mode, nmbx*4 + 4);
+ ALLOC(enc->df.df_qp, nmbx);
+ ALLOC(enc->df.mb_type, nmbx);
+ ALLOC(enc->df.df_nzflag, nmbx);
+ ALLOC(enc->top_line, nmbx*32 + 32 + 16);
+ return (int)(p - p0);
+}
+
+/**
+* Setup H264E_io_yuv_t structures
+*/
+static pix_t *io_yuv_set_pointers(pix_t *base, H264E_io_yuv_t *frm, int w, int h)
+{
+ int s = w + (16 + 16); // guards
+ int i, guard = 16;
+ for (i = 0; i < 3; i++)
+ {
+ frm->stride[i] = s;
+ frm->yuv[i] = base + (s + 1)*guard;
+ base += s*(h + 2*guard);
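+ // after the luma plane, halve the guard, stride and height for the two chroma planes (4:2:0)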
+ if (!i) guard >>= 1, s >>= 1, h >>= 1;
+ }
+ return base;
+}
+
+/**
+* Verify encoder creation parameters. Return an error code, or 0 if the parameters are valid.
+*/
+static int enc_check_create_params(const H264E_create_param_t *par)
+{
+ if (!par)
+ {
+ return H264E_STATUS_BAD_ARGUMENT; // NULL argument
+ }
+ if ((int)(par->vbv_size_bytes | par->gop) < 0)
+ {
+ return H264E_STATUS_BAD_PARAMETER; // negative GOP or VBV size
+ }
+ if (par->width <= 0 || par->height <= 0)
+ {
+ return H264E_STATUS_BAD_PARAMETER; // non-positive frame size
+ }
+ if ((unsigned)(par->const_input_flag | par->fine_rate_control_flag |
+ par->vbv_overflow_empty_frame_flag | par->vbv_underflow_stuffing_flag) > 1)
+ {
+ return H264E_STATUS_BAD_PARAMETER; // Any flag is not 0 or 1
+ }
+ if ((unsigned)par->max_long_term_reference_frames > MAX_LONG_TERM_FRAMES)
+ {
+ return H264E_STATUS_BAD_PARAMETER; // Too many long-term reference frames requested
+ }
+ if ((par->width | par->height) & 1)
+ {
+ return H264E_STATUS_SIZE_NOT_MULTIPLE_2; // frame size must be multiple of 2
+ }
+ if (((par->width | par->height) & 15) && !par->const_input_flag)
+ {
+ // if input buffer reused as scratch (par->const_input_flag == 0)
+ // frame size must be multiple of 16
+ return H264E_STATUS_SIZE_NOT_MULTIPLE_16;
+ }
+ return H264E_STATUS_SUCCESS;
+}
+
+static int H264E_sizeof_one(const H264E_create_param_t *par, int *sizeof_persist, int *sizeof_scratch, int inp_buf_flag)
+{
+ int error = enc_check_create_params(par);
+ if (!sizeof_persist || !sizeof_scratch)
+ {
+ error = H264E_STATUS_BAD_ARGUMENT;
+ }
+ if (error)
+ {
+ return error;
+ }
+
+ *sizeof_persist = enc_alloc(NULL, par, (void*)(uintptr_t)1, inp_buf_flag) + sizeof(h264e_enc_t);
+#if H264E_MAX_THREADS > 1
+ *sizeof_scratch = enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1) * (par->max_threads + 1);
+#else
+ *sizeof_scratch = enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1);
+#endif
+ return error;
+}
+
+static int H264E_init_one(h264e_enc_t *enc, const H264E_create_param_t *opt, int inp_buf_flag)
+{
+ pix_t *base;
+#if H264E_CONFIGS_COUNT > 1
+ init_vft(opt->enableNEON);
+#endif
+ memset(enc, 0, sizeof(*enc));
+
+ enc->frame.nmbx = (opt->width + 15) >> 4;
+ enc->frame.nmby = (opt->height + 15) >> 4;
+ enc->frame.nmb = enc->frame.nmbx*enc->frame.nmby;
+ enc->frame.w = enc->frame.nmbx*16;
+ enc->frame.h = enc->frame.nmby*16;
+ enc->frame.mv_limit.tl = point(-MV_GUARD*4, -MV_GUARD*4);
+ enc->frame.mv_qpel_limit.tl = mv_add(enc->frame.mv_limit.tl, point(4*4, 4*4));
+ enc->frame.mv_limit.br = point((enc->frame.nmbx*16 - (16 - MV_GUARD))*4, (enc->frame.nmby*16 - (16 - MV_GUARD))*4);
+ enc->frame.mv_qpel_limit.br = mv_add(enc->frame.mv_limit.br, point(-4*4, -4*4));
+ enc->frame.cropping_flag = !!((opt->width | opt->height) & 15);
+ enc->param = *opt;
+
+ enc_alloc(enc, opt, (void*)(enc + 1), inp_buf_flag);
+
+#if H264E_SVC_API
+ if (inp_buf_flag)
+ {
+ enc->inp.yuv[1] = enc->inp.yuv[0] + enc->frame.w*enc->frame.h;
+ enc->inp.yuv[2] = enc->inp.yuv[1] + enc->frame.w*enc->frame.h/4;
+ enc->inp.stride[0] = enc->frame.w;
+ enc->inp.stride[1] = enc->frame.w/2;
+ enc->inp.stride[2] = enc->frame.w/2;
+ enc->dec = enc->inp;
+ }
+#endif
+
+ base = io_yuv_set_pointers(enc->ref.yuv[0], &enc->ref, enc->frame.nmbx*16, enc->frame.nmby*16);
+#if H264E_ENABLE_DENOISE
+ if (enc->param.temporal_denoise_flag)
+ {
+ pix_t *p = base;
+ base = io_yuv_set_pointers(base, &enc->denoise, enc->frame.nmbx*16, enc->frame.nmby*16);
+ while (p < base) *p++ = 0;
+ }
+#endif
+ if (enc->param.const_input_flag)
+ {
+ base = io_yuv_set_pointers(base, &enc->dec, enc->frame.nmbx*16, enc->frame.nmby*16);
+ }
+ if (enc->param.max_long_term_reference_frames)
+ {
+ H264E_io_yuv_t t;
+ int i;
+ for (i = 0; i < enc->param.max_long_term_reference_frames; i++)
+ {
+ base = io_yuv_set_pointers(base, &t, enc->frame.nmbx*16, enc->frame.nmby*16);
+ enc->lt_yuv[i][0] = t.yuv[0];
+ enc->lt_yuv[i][1] = t.yuv[1];
+ enc->lt_yuv[i][2] = t.yuv[2];
+ }
+ }
+ return H264E_STATUS_SUCCESS;
+}
+
+/**
+* Encoder initialization
+* See header file for details.
+*/
+int H264E_init(h264e_enc_t *enc, const H264E_create_param_t *opt)
+{
+ h264e_enc_t *enc_curr = enc;
+ int i, ret;
+ (void)i;
+
+ ret = H264E_init_one(enc_curr, opt, 0);
+
+#if H264E_SVC_API
+ for (i = opt->num_layers; i > 1; i--)
+ {
+ H264E_create_param_t opt_next = enc_curr->param;
+ int sizeof_persist = 0, sizeof_scratch = 0;
+
+ opt_next.const_input_flag = 0;
+ opt_next.temporal_denoise_flag = 0;
+ opt_next.width = opt_next.width >> 1;
+ opt_next.width += opt_next.width & 1;
+ opt_next.height = opt_next.height >> 1;
+ opt_next.height+= opt_next.height & 1;
+
+ opt_next.vbv_size_bytes <<= 2;
+
+ H264E_sizeof_one(&enc_curr->param, &sizeof_persist, &sizeof_scratch, 1);
+ enc_curr = enc_curr->enc_next = (char *)enc_curr + sizeof_persist;
+
+ ret = H264E_init_one(enc_curr, &opt_next, 1);
+ if (ret)
+ break;
+ }
+#endif
+ return ret;
+}
+
+static void encode_slice(h264e_enc_t *enc, int frame_type, int long_term_idx_use, int long_term_idx_update, int pps_id, int enc_type)
+{
+ int i, k;
+ encode_slice_header(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id,enc_type);
+ // encode frame
+ do
+ { // encode row
+ do
+ { // encode macroblock
+ if (enc->run_param.desired_nalu_bytes &&
+ h264e_bs_get_pos_bits(enc->bs) > enc->run_param.desired_nalu_bytes*8u)
+ {
+ // start new slice
+ nal_end(enc);
+ encode_slice_header(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+ }
+
+ mb_encode(enc, enc_type);
+
+ enc->dec.yuv[0] += 16;
+ enc->dec.yuv[1] += 8;
+ enc->dec.yuv[2] += 8;
+
+ enc->mb.num++; // before rc_mb_end
+ if (enc->param.fine_rate_control_flag)
+ {
+ rc_mb_end(enc);
+ }
+ } while (++enc->mb.x < enc->frame.nmbx);
+
+ for (i = 0, k = 16; i < 3; i++, k = 8)
+ {
+ enc->dec.yuv[i] += k*(enc->dec.stride[i] - enc->frame.nmbx);
+ }
+
+ // start new row
+ enc->mb.x = 0;
+ *((uint32_t*)(enc->nnz)) = *((uint32_t*)(enc->nnz + 4)) = 0x01010101 * NNZ_NA; // left edge of NNZ predictor
+ enc->i4x4mode[0] = -1;
+
+ } while (++enc->mb.y < enc->frame.nmby);
+
+ if (enc->mb.skip_run)
+ {
+ UE(enc->mb.skip_run);
+ }
+
+ nal_end(enc);
+ for (i = 0, k = 16; i < 3; i++, k = 8)
+ {
+ enc->dec.yuv[i] -= k*enc->dec.stride[i]*enc->frame.nmby;
+ }
+}
+
+#if H264E_MAX_THREADS
+typedef struct
+{
+ H264E_persist_t *enc;
+ int frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type;
+} h264_enc_slice_thread_params_t;
+
+static void encode_slice_thread_simple(void *arg)
+{
+ h264_enc_slice_thread_params_t *h = (h264_enc_slice_thread_params_t*)arg;
+ encode_slice(h->enc, h->frame_type, h->long_term_idx_use, h->long_term_idx_update, h->pps_id, h->enc_type);
+}
+#endif
+
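+/*
+* Encode one frame for a single layer.  With max_threads > 1 the frame is
+* split into horizontal bands of macroblock rows; each thread encodes its
+* band on a private copy of the encoder state with its own scratch area and
+* bitstream buffer, and the per-thread outputs are concatenated afterwards.
+*/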
+static int H264E_encode_one(H264E_persist_t *enc, const H264E_run_param_t *opt,
+ int long_term_idx_use, int is_refers_to_long_term, int long_term_idx_update,
+ int frame_type, int pps_id, int enc_type)
+{
+ int i, k;
+ // slice reset
+ enc->slice.type = (long_term_idx_use < 0 ? SLICE_TYPE_I : SLICE_TYPE_P);
+ rc_frame_start(enc, (long_term_idx_use < 0) ? 1 : 0, is_refers_to_long_term);
+
+ enc->mb.x = enc->mb.y = enc->mb.num = 0;
+
+ if (long_term_idx_use > 0)
+ {
+ // Activate long-term reference buffer
+ for (i = 0; i < 3; i++)
+ {
+ SWAP(pix_t*, enc->ref.yuv[i], enc->lt_yuv[long_term_idx_use - 1][i]);
+ }
+ }
+
+ if (enc->param.vbv_size_bytes && !long_term_idx_use && long_term_idx_update <= 0 &&
+ enc->rc.vbv_bits - enc->run_param.desired_frame_bytes*8 > enc->param.vbv_size_bytes*8)
+ {
+ // encode transparent frame on VBV overflow
+ encode_slice_header(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+ enc->mb.skip_run = enc->frame.nmb;
+ UE(enc->mb.skip_run);
+ nal_end(enc);
+ for (i = 0, k = 16; i < 3; i++, k = 8)
+ {
+ pix_copy_pic(enc->dec.yuv[i], enc->dec.stride[i], enc->ref.yuv[i], enc->ref.stride[i], enc->frame.nmbx*k, enc->frame.nmby*k);
+ }
+ } else
+ {
+#if H264E_MAX_THREADS
+ if (enc->param.max_threads > 1)
+ {
+ H264E_persist_t enc_thr[H264E_MAX_THREADS];
+ int sizeof_scratch = enc_alloc_scratch(NULL, &enc->param, (void*)(uintptr_t)1);
+ unsigned char *scratch_base = ((unsigned char*)enc->scratch) + sizeof_scratch;
+ int mby = 0;
+ int ithr;
+ int nmby = enc->frame.nmby;
+ void *savep[3];
+ for (i = 0; i < 3; i++)
+ {
+ savep[i] = enc->dec.yuv[i];
+ }
+
+ for (ithr = 0; ithr < enc->param.max_threads; ithr++)
+ {
+ enc_thr[ithr] = *enc;
+ enc_thr[ithr].mb.y = mby;
+ enc_thr[ithr].mb.num = mby*enc->frame.nmbx;
+ mby += (enc->frame.nmby - mby) / (enc->param.max_threads - ithr);
+ enc_thr[ithr].frame.nmby = mby;
+ enc_thr[ithr].rc.bit_budget /= enc->param.max_threads;
+ enc_thr[ithr].frame.nmb = enc_thr[ithr].frame.nmbx * enc_thr[ithr].frame.nmby;
+
+ for (i = 0, k = 16; i < 3; i++, k = 8)
+ {
+ enc_thr[ithr].dec.yuv[i] += k*enc->dec.stride[i]*enc_thr[ithr].mb.y;
+ }
+
+ //enc_alloc_scratch(enc_thr + ithr, &enc->param, (unsigned char*)(scratch_thr[ithr]));
+ scratch_base += enc_alloc_scratch(enc_thr + ithr, &enc->param, scratch_base);
+ enc_thr[ithr].out_pos = 0;
+ h264e_bs_init_bits(enc_thr[ithr].bs, enc_thr[ithr].out);
+ }
+
+ {
+ h264_enc_slice_thread_params_t thread_par[H264E_MAX_THREADS];
+ void *args[H264E_MAX_THREADS];
+ for (i = 0; i < enc->param.max_threads; i++)
+ {
+ thread_par[i].enc = enc_thr + i;
+ thread_par[i].frame_type = frame_type;
+ thread_par[i].long_term_idx_use = long_term_idx_use;
+ thread_par[i].long_term_idx_update = long_term_idx_update;
+ thread_par[i].pps_id = pps_id;
+ thread_par[i].enc_type = enc_type;
+ args[i] = thread_par + i;
+ }
+ enc->param.run_func_in_thread(enc->param.token, encode_slice_thread_simple, args, enc->param.max_threads);
+ }
+
+ for (i = 0; i < enc->param.max_threads; i++)
+ {
+ memcpy(enc->out + enc->out_pos, enc_thr[i].out, enc_thr[i].out_pos);
+ enc->out_pos += enc_thr[i].out_pos;
+ }
+ enc->frame.nmby = nmby;
+ for (i = 0; i < 3; i++)
+ {
+ enc->dec.yuv[i] = savep[i];
+ }
+ } else
+#endif
+ {
+ encode_slice(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+ }
+ }
+
+ // Set flags for the adaptive memory management (AMM) state machine, for standard compliance
+ if (frame_type == H264E_FRAME_TYPE_KEY)
+ {
+ // Reset long-term reference frames
+ memset(enc->lt_used, 0, sizeof(enc->lt_used));
+ // Assume that this frame is not short-term (has an effect only if AMM is used)
+ enc->short_term_used = 0;
+ }
+ if (long_term_idx_update > 0)
+ {
+ enc->lt_used[long_term_idx_update - 1] = 1;
+ } else if (long_term_idx_update == 0)
+ {
+ enc->short_term_used = 1;
+ }
+
+ rc_frame_end(enc, long_term_idx_use == -1, enc->mb.skip_run == enc->frame.nmb, is_refers_to_long_term);
+
+ if (long_term_idx_use > 0)
+ {
+ // deactivate long-term reference
+ for (i = 0; i < 3; i++)
+ {
+ SWAP(pix_t*, enc->ref.yuv[i], enc->lt_yuv[long_term_idx_use - 1][i]);
+ }
+ }
+
+ if (long_term_idx_update != -1)
+ {
+ pix_copy_recon_pic_to_ref(enc);
+
+ if (++enc->frame.num >= enc->param.gop && enc->param.gop && (opt->frame_type == H264E_FRAME_TYPE_DEFAULT))
+ {
+ enc->frame.num = 0; // trigger to encode IDR on next call
+ }
+
+ if (long_term_idx_update > 0)
+ {
+ for (i = 0; i < 3; i++)
+ {
+ SWAP(pix_t*, enc->ref.yuv[i], enc->lt_yuv[long_term_idx_update - 1][i]);
+ }
+ }
+ }
+
+ return H264E_STATUS_SUCCESS;
+}
+
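+/*
+* Input alignment check: with the NEON/SSE2 code paths the caller's plane
+* pointers and strides must be 8-byte aligned, except when both
+* const_input_flag and temporal_denoise_flag are set.
+*/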
+static int check_parameters_align(const H264E_create_param_t *opt, const H264E_io_yuv_t *in)
+{
+ int i;
+ int min_align = 0;
+#if H264E_ENABLE_NEON || H264E_ENABLE_SSE2
+ min_align = 7;
+#endif
+ if (opt->const_input_flag && opt->temporal_denoise_flag)
+ {
+ min_align = 0;
+ }
+ for (i = 0; i < 3; i++)
+ {
+ if (((uintptr_t)in->yuv[i]) & min_align)
+ {
+ return i ? H264E_STATUS_BAD_CHROMA_ALIGN : H264E_STATUS_BAD_LUMA_ALIGN;
+ }
+ if (in->stride[i] & min_align)
+ {
+ return i ? H264E_STATUS_BAD_CHROMA_STRIDE : H264E_STATUS_BAD_LUMA_STRIDE;
+ }
+ }
+ return H264E_STATUS_SUCCESS;
+}
+
+/**
+* Top-level encode function
+* See header file for details.
+*/
+int H264E_encode(H264E_persist_t *enc, H264E_scratch_t *scratch, const H264E_run_param_t *opt,
+ H264E_io_yuv_t *in, unsigned char **coded_data, int *sizeof_coded_data)
+{
+ int i;
+ int frame_type;
+ int long_term_idx_use;
+ int long_term_idx_update;
+ int is_refers_to_long_term;
+ int error;
+
+ error = check_parameters_align(&enc->param, in);
+ if (error)
+ {
+ return error;
+ }
+ (void)i;
+ i = enc_alloc_scratch(enc, &enc->param, (unsigned char*)scratch);
+#if H264E_SVC_API
+ {
+ H264E_persist_t *e = enc->enc_next;
+ while (e)
+ {
+ i += enc_alloc_scratch(e, &enc->param, ((unsigned char*)scratch) + i);
+ e = e->enc_next;
+ }
+ }
+#endif
+
+ enc->inp = *in;
+
+#if H264E_ENABLE_DENOISE
+ // 1. Run optional denoise filter
+ if (enc->param.temporal_denoise_flag && opt->encode_speed < 2)
+ {
+ int sh = 0;
+ for (i = 0; i < 3; i++)
+ {
+ h264e_denoise_run(in->yuv[i], enc->denoise.yuv[i], enc->param.width >> sh, enc->param.height >> sh, in->stride[i], enc->denoise.stride[i]);
+ enc->inp.yuv[i] = enc->denoise.yuv[i];
+ enc->inp.stride[i] = enc->denoise.stride[i];
+ sh = 1;
+ }
+ }
+#endif
+
+ enc->out_pos = 0; // reset output bitbuffer position
+
+ if (opt)
+ {
+ enc->run_param = *opt; // local copy of run-time parameters
+ }
+ opt = &enc->run_param; // refer to local copy
+
+ // clamp unset or out-of-range QP limits into [MIN_QP, 51] without reporting an error
+ if (!enc->run_param.qp_max || enc->run_param.qp_max > 51)
+ {
+ enc->run_param.qp_max = 51;
+ }
+ if (!enc->run_param.qp_min || enc->run_param.qp_min < MIN_QP)
+ {
+ enc->run_param.qp_min = MIN_QP;
+ }
+
+ enc->speed.disable_deblock = (opt->encode_speed == 8 || opt->encode_speed == 10);
+
+ if (!enc->param.const_input_flag)
+ {
+ // if input frame can be re-used as a scratch, set reconstructed frame to the input
+ enc->dec = *in;
+ }
+
+ // Set default frame type
+ frame_type = opt->frame_type;
+ if (frame_type == H264E_FRAME_TYPE_DEFAULT)
+ {
+ frame_type = enc->frame.num ? H264E_FRAME_TYPE_P : H264E_FRAME_TYPE_KEY;
+ }
+ // Estimate long-term indexes from frame type
+ // index 0 means "short-term" reference
+ // index -1 means "not used"
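+ // e.g. GOLDEN both predicts from and refreshes long-term slot 1, while
+ // DROPPABLE predicts from the most recent reference and updates nothing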
+ switch (frame_type)
+ {
+ default:
+ case H264E_FRAME_TYPE_I: long_term_idx_use = -1; long_term_idx_update = 0; break;
+ case H264E_FRAME_TYPE_KEY: long_term_idx_use = -1; long_term_idx_update = enc->param.max_long_term_reference_frames > 0; break;
+ case H264E_FRAME_TYPE_GOLDEN: long_term_idx_use = 1; long_term_idx_update = 1; break;
+ case H264E_FRAME_TYPE_RECOVERY: long_term_idx_use = 1; long_term_idx_update = 0; break;
+ case H264E_FRAME_TYPE_P: long_term_idx_use = enc->most_recent_ref_frame_idx; long_term_idx_update = 0; break;
+ case H264E_FRAME_TYPE_DROPPABLE:long_term_idx_use = enc->most_recent_ref_frame_idx; long_term_idx_update = -1; break;
+ case H264E_FRAME_TYPE_CUSTOM: long_term_idx_use = opt->long_term_idx_use; long_term_idx_update = opt->long_term_idx_update;
+ if (!long_term_idx_use)
+ {
+ long_term_idx_use = enc->most_recent_ref_frame_idx;
+ }
+ if (long_term_idx_use < 0)
+ {
+ // hack: redefine frame type, always encode IDR
+ frame_type = H264E_FRAME_TYPE_KEY;
+ }
+ break;
+ }
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+ is_refers_to_long_term = (long_term_idx_use != enc->most_recent_ref_frame_idx && long_term_idx_use >= 0);
+#else
+ is_refers_to_long_term = 0;
+#endif
+
+ if (long_term_idx_update >= 0)
+ {
+ enc->most_recent_ref_frame_idx = long_term_idx_update;
+ }
+ if (frame_type == H264E_FRAME_TYPE_KEY)
+ {
+ int pic_init_qp = 30;
+ pic_init_qp = MIN(pic_init_qp, enc->run_param.qp_max);
+ pic_init_qp = MAX(pic_init_qp, enc->run_param.qp_min);
+
+ // temporary limitation: only two layers are handled here
+ enc->sps.pic_init_qp = pic_init_qp;
+ enc->next_idr_pic_id ^= 1;
+ enc->frame.num = 0;
+
+#if H264E_SVC_API
+ if (enc->param.num_layers > 1)
+ {
+ H264E_persist_t *enc_base = enc->enc_next;
+ enc_base->sps.pic_init_qp = pic_init_qp;
+ enc_base->next_idr_pic_id ^= 1;
+ enc_base->frame.num = 0;
+
+ enc_base->out = enc->out;
+ enc_base->out_pos = 0;
+ encode_sps(enc_base, 66);
+ encode_pps(enc_base, 0);
+
+ enc->out_pos += enc_base->out_pos;
+ encode_sps(enc, 83);
+ encode_pps(enc, 1);
+ } else
+#endif
+ {
+ encode_sps(enc, 66);
+ encode_pps(enc, 0);
+ }
+ } else
+ {
+ if (!enc->sps.pic_init_qp)
+ {
+ return H264E_STATUS_BAD_FRAME_TYPE;
+ }
+ if (long_term_idx_use > enc->param.max_long_term_reference_frames ||
+ long_term_idx_update > enc->param.max_long_term_reference_frames ||
+ long_term_idx_use > MAX_LONG_TERM_FRAMES)
+ {
+ return H264E_STATUS_BAD_FRAME_TYPE;
+ }
+ }
+
+#if H264E_SVC_API
+ if (enc->param.num_layers > 1)
+ {
+ H264E_persist_t *enc_base = enc->enc_next;
+ int sh = 0;
+
+ enc_base->run_param = enc->run_param;
+ enc_base->run_param.desired_frame_bytes = enc->run_param.desired_frame_bytes >> 2;
+
+ for (i = 0; i < 3; i++)
+ {
+ h264e_frame_downsampling(enc_base->inp.yuv[i], enc_base->inp.stride[i], enc_base->frame.h >> sh,
+ in->yuv[i], in->stride[i], enc->param.height >> sh, enc_base->param.width >> sh,
+ enc_base->param.height >> sh, enc->param.width >> sh, enc->param.height >> sh);
+ sh = 1;
+ }
+
+ enc_base->scratch = enc->scratch;
+ enc_base->out = enc->out + enc->out_pos;
+ enc_base->out_pos = 0;
+
+ H264E_encode_one(enc_base, &enc_base->run_param, long_term_idx_use, is_refers_to_long_term, long_term_idx_update,
+ frame_type, enc->param.sps_id*4 + 0, 0);
+
+ enc->out_pos += enc_base->out_pos;
+
+ if ((frame_type == H264E_FRAME_TYPE_I || frame_type == H264E_FRAME_TYPE_KEY) && enc->param.inter_layer_pred_flag)
+ {
+ for (i = 0, sh = 0; i < 3; i++, sh = 1)
+ {
+ h264e_intra_upsampling(enc_base->frame.w >> sh, enc_base->frame.h >> sh, enc->frame.w >> sh, enc->frame.h >> sh,
+ sh, enc_base->dec.yuv[i], enc_base->dec.stride[i], enc->ref.yuv[i], enc->ref.stride[i]);
+ }
+ }
+
+ memset(enc->df.df_nzflag, 0, enc->frame.nmbx);
+ H264E_encode_one(enc, opt, long_term_idx_use, is_refers_to_long_term, long_term_idx_update,
+ frame_type, enc->param.sps_id*4 + 1, 20);
+ } else
+#endif // H264E_SVC_API
+ {
+ H264E_encode_one(enc, opt, long_term_idx_use, is_refers_to_long_term, long_term_idx_update,
+ frame_type, enc->param.sps_id*4 + 0, 0);
+ }
+
+ *sizeof_coded_data = enc->out_pos;
+ *coded_data = enc->out;
+ return H264E_STATUS_SUCCESS;
+}
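+
+/*
+* Per-frame usage (illustrative sketch, not part of the library): point an
+* H264E_io_yuv_t at the three input planes and strides, then collect the
+* coded NAL units.  Rate-control fields of the run parameters are left at
+* their defaults here; persist, scratch, io and fd are placeholders.
+*
+*   H264E_run_param_t rp = {0};   // frame_type DEFAULT, QP limits fixed up above
+*   unsigned char *nal;
+*   int nalsz;
+*   if (H264E_encode(persist, scratch, &rp, &io, &nal, &nalsz) == H264E_STATUS_SUCCESS)
+*       write(fd, nal, nalsz);    // this frame's NAL units
+*/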
+
+/**
+* Return persistent and scratch memory requirements
+* for given encoding options.
+* See header file for details.
+*/
+int H264E_sizeof(const H264E_create_param_t *par, int *sizeof_persist, int *sizeof_scratch)
+{
+ int i;
+ int error = H264E_sizeof_one(par, sizeof_persist, sizeof_scratch, 0);
+ (void)i;
+#if H264E_SVC_API
+ for (i = par->num_layers; i > 1; i--)
+ {
+ H264E_create_param_t opt_next = *par;
+ opt_next.const_input_flag = 1;
+ opt_next.temporal_denoise_flag = 0;
+ opt_next.width = opt_next.width >> 1;
+ opt_next.width += opt_next.width & 1;
+ opt_next.height = opt_next.height >> 1;
+ opt_next.height += opt_next.height & 1;
+ *sizeof_persist += enc_alloc(NULL, par, (void*)(uintptr_t)1, 1) + sizeof(h264e_enc_t);
+#if H264E_MAX_THREADS > 1
+ *sizeof_scratch += enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1) * (H264E_MAX_THREADS + 1);
+#else
+ *sizeof_scratch += enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1);
+#endif
+ }
+#endif
+ return error;
+}
+
+/**
+* Set VBV size and fullness
+* See header file for details.
+*/
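+/* (can be called between H264E_encode() calls, e.g. to change the VBV
+   size after a bitrate change; vbv_fullness_bytes = -1 keeps the current
+   fullness) */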
+void H264E_set_vbv_state(
+ H264E_persist_t *enc,
+ int vbv_size_bytes, //< New VBV size
+ int vbv_fullness_bytes //< New VBV fullness, -1 = no change
+)
+{
+ if (enc)
+ {
+ enc->param.vbv_size_bytes = vbv_size_bytes;
+ if (vbv_fullness_bytes >= 0)
+ {
+ enc->rc.vbv_bits = vbv_fullness_bytes*8;
+ enc->rc.vbv_target_level = enc->rc.vbv_bits;
+ }
+ }
+}
+#endif
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,17 @@
+</$objtype/mkfile
+
+CFLAGS=$CFLAGS -p -I/sys/include/npe -D__plan9__
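+# -p: use the external ANSI C preprocessor; /sys/include/npe presumably
+# provides the hosted libc headers minih264e.h expects under -D__plan9__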
+BIN=/$objtype/bin/video
+TARG=hj264
+
+HFILES=\
+ minih264e.h\
+
+UPDATE=$HFILES
+
+OFILES=\
+ hj264.$O\
+
+default:V: all
+
+</sys/src/cmd/mkone