From ecc9bfab548f464d4c2be899055f7ba567c1ed8e Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sun, 14 Sep 2008 21:36:45 -0700
Subject: [PATCH] Psychovisually optimized rate-distortion optimization and
 trellis. The latter, psy-trellis, is disabled by default and is reserved as
 experimental; your mileage may vary. Default subme is raised to 6 so that psy
 RD is on by default.

---
 common/common.c      |  20 +++++++-
 common/common.h      |  12 +++++
 common/dct.h         |  11 +++++
 encoder/analyse.c    |  74 +++++++++++++++++++++++++++---
 encoder/encoder.c    |  21 +++++++++
 encoder/macroblock.c |  22 ++++-----
 encoder/macroblock.h |   4 +-
 encoder/rdo.c        | 106 +++++++++++++++++++++++++++++++++++--------
 x264.c               |   5 ++
 x264.h               |   4 +-
 10 files changed, 240 insertions(+), 39 deletions(-)

diff --git a/common/common.c b/common/common.c
index 138c5f3b..9d84bfaa 100644
--- a/common/common.c
+++ b/common/common.c
@@ -116,8 +116,10 @@ void    x264_param_default( x264_param_t *param )
                          | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
     param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     param->analyse.i_me_method = X264_ME_HEX;
+    param->analyse.f_psy_rd = 1.0;
+    param->analyse.f_psy_trellis = 0;
     param->analyse.i_me_range = 16;
-    param->analyse.i_subpel_refine = 5;
+    param->analyse.i_subpel_refine = 6;
     param->analyse.b_chroma_me = 1;
     param->analyse.i_mv_range_thread = -1;
     param->analyse.i_mv_range = -1; // set from level_idc
@@ -470,6 +472,21 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->analyse.i_mv_range_thread = atoi(value);
     OPT2("subme", "subq")
         p->analyse.i_subpel_refine = atoi(value);
+    OPT("psy-rd")
+    {
+        if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
+            2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
+        { } /* "rd:trellis" or "rd,trellis": both strengths were parsed */
+        else if( 1 == sscanf( value, "%f", &p->analyse.f_psy_rd ) ) /* test for 1: sscanf returns EOF (nonzero) on empty input */
+        {
+            p->analyse.f_psy_trellis = 0; /* only the RD strength was given */
+        }
+        else
+        {
+            p->analyse.f_psy_rd = 0; /* nothing parsable: disable both */
+            p->analyse.f_psy_trellis = 0;
+        }
+    }
     OPT("bime")
         p->analyse.b_bidir_me = atobool(value);
     OPT("chroma-me")
@@ -824,6 +841,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
     s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
     s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
+    s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
     s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );
     s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
     s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
diff --git a/common/common.h b/common/common.h
index 90c7cdb4..37b0b205 100644
--- a/common/common.h
+++ b/common/common.h
@@ -381,6 +381,8 @@ struct x264_t
         int     b_chroma_me;
         int     b_trellis;
         int     b_noise_reduction;
+        int     i_psy_rd; /* Psy RD strength--fixed point value*/
+        int     i_psy_trellis; /* Psy trellis strength--fixed point value*/
 
         int     b_interlaced;
 
@@ -462,6 +464,16 @@ struct x264_t
             DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
             DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
 
+            /* Psy trellis DCT data */
+            DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
+            DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
+
+            /* Psy RD SATD scores */
+            int fenc_satd[4][4];
+            int fenc_satd_sum;
+            int fenc_sa8d[2][2];
+            int fenc_sa8d_sum;
+
             /* pointer over mb of the frame to be compressed */
             uint8_t *p_fenc[3];
 
diff --git a/common/dct.h b/common/dct.h
index 1078023d..daa96f4a 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -41,6 +41,17 @@ static const uint16_t x264_dct8_weight_tab[64] = {
 };
 #undef W
 
+#define W(i) (i==0 ? FIX8(1.76777) :\
+              i==1 ? FIX8(1.11803) :\
+              i==2 ? FIX8(0.70711) :0)
+static const uint16_t x264_dct4_weight_tab[16] = { /* 4x4 weights; read as psy_weight via [zigzag[i]] in quant_trellis_cabac */
+    W(0), W(1), W(0), W(1),
+    W(1), W(2), W(1), W(2),
+    W(0), W(1), W(0), W(1),
+    W(1), W(2), W(1), W(2)
+};
+#undef W
+
 /* inverse squared */
 #define W(i) (i==0 ? FIX8(3.125) :\
               i==1 ? FIX8(1.25) :\
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 10399690..8f7b4fa9 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -467,6 +467,58 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
     }
 }
 
+/* Fill the psy-trellis DCT caches (fenc_dct8/fenc_dct4) from p_fenc[0].  For trellis=2 this is needed for both DCT sizes; for trellis=1 only for the chosen one. */
+static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
+{
+    DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
+    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+    DECLARE_ALIGNED_16( uint8_t zero[16*FDEC_STRIDE] ) = {0}; /* all-zero block passed as the second pixel input of the DCT calls */
+    int i;
+
+    if( do_both_dct || h->mb.b_transform_8x8 ) /* 8x8 transform: four zigzag-scanned blocks into fenc_dct8 */
+    {
+        h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
+        for( i = 0; i < 4; i++ )
+            h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
+    }
+    if( do_both_dct || !h->mb.b_transform_8x8 ) /* 4x4 transform: sixteen zigzag-scanned blocks into fenc_dct4 */
+    {
+        h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
+        for( i = 0; i < 16; i++ )
+            h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
+    }
+}
+
+/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients.  Also fills the psy-trellis DCT caches when trellis=2. */
+static inline void x264_mb_cache_fenc_satd( x264_t *h )
+{
+    DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; /* zero row, passed with stride 0 as the comparison block */
+    uint8_t *fenc;
+    int x, y, satd_sum = 0, sa8d_sum = 0;
+    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
+        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 ); /* both DCT sizes only if 8x8dct is enabled */
+    if( !h->mb.i_psy_rd )
+        return; /* psy RD off: ssd_plane never reads the scores cached below */
+    for( y = 0; y < 4; y++ ) /* 4x4 grid of 4x4 blocks over plane 0 */
+        for( x = 0; x < 4; x++ )
+        {
+            fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
+            h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
+                                      - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1); /* DC term: SAD against zero, halved */
+            satd_sum += h->mb.pic.fenc_satd[y][x];
+        }
+    for( y = 0; y < 2; y++ ) /* 2x2 grid of 8x8 blocks over plane 0 */
+        for( x = 0; x < 2; x++ )
+        {
+            fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
+            h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
+                                      - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2); /* DC term: SAD against zero, quartered */
+            sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
+        }
+    h->mb.pic.fenc_satd_sum = satd_sum; /* totals returned directly by sum_satd/sum_sa8d for PIXEL_16x16 */
+    h->mb.pic.fenc_sa8d_sum = sa8d_sum;
+}
+
 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 {
     int i;
@@ -1017,12 +1069,15 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
 
     h->mb.i_type = P_L0;
-    if( a->b_mbrd && a->l0.me16x16.i_ref == 0
-        && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+    if( a->b_mbrd )
     {
-        h->mb.i_partition = D_16x16;
-        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
-        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+        x264_mb_cache_fenc_satd( h );
+        if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+        {
+            h->mb.i_partition = D_16x16;
+            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
+            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+        }
     }
 }
 
@@ -1907,7 +1962,7 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
 
 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 {
-    int thresh = i_satd_inter * 17/16;
+    int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
 
     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
     {
@@ -2066,6 +2121,8 @@ void x264_macroblock_analyse( x264_t *h )
     /*--------------------------- Do the analysis ---------------------------*/
     if( h->sh.i_type == SLICE_TYPE_I )
     {
+        if( analysis.b_mbrd )
+            x264_mb_cache_fenc_satd( h );
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
         if( analysis.b_mbrd )
             x264_intra_rd( h, &analysis, COST_MAX );
@@ -2344,6 +2401,9 @@ void x264_macroblock_analyse( x264_t *h )
         int i_bskip_cost = COST_MAX;
         int b_skip = 0;
 
+        if( analysis.b_mbrd )
+            x264_mb_cache_fenc_satd( h );
+
         h->mb.i_type = B_SKIP;
         if( h->mb.b_direct_auto_write )
         {
@@ -2589,6 +2649,8 @@ void x264_macroblock_analyse( x264_t *h )
 
     h->mb.b_trellis = h->param.analyse.i_trellis;
     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
+    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
+        x264_psy_trellis_init( h, 0 );
     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
         h->mb.i_skip_intra = 0;
 }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index e918352f..5a8869d4 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -410,6 +410,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_trellis = 0;
         h->param.analyse.b_fast_pskip = 0;
         h->param.analyse.i_noise_reduction = 0;
+        h->param.analyse.f_psy_rd = 0;
     }
     if( h->param.rc.i_rc_method == X264_RC_CQP )
     {
@@ -488,6 +489,26 @@ static int x264_validate_parameters( x264_t *h )
     if( !h->param.b_cabac )
         h->param.analyse.i_trellis = 0;
     h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
+    if( !h->param.analyse.i_trellis )
+        h->param.analyse.f_psy_trellis = 0;
+    h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
+    h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
+    if( h->param.analyse.i_subpel_refine < 6 )
+        h->param.analyse.f_psy_rd = 0;
+    h->mb.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );
+    /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */
+    /* so we lower the chroma QP offset to compensate */
+    /* This can be triggered repeatedly on multiple calls to x264_validate_parameters, but since encoding
+     * uses the pps chroma qp offset, not the param chroma qp offset, this is not a problem. */
+    if( h->mb.i_psy_rd )
+        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;
+    h->mb.i_psy_trellis = FIX8( h->param.analyse.f_psy_trellis / 4 );
+    /* Psy trellis has a similar effect. */
+    if( h->mb.i_psy_trellis )
+        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
+    else
+        h->mb.i_psy_trellis = 0;
+    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
     h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 );
     if( h->param.rc.f_aq_strength <= 0 )
         h->param.rc.i_aq_mode = 0;
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index a353ce71..e877ccbb 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -79,20 +79,20 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
     return i_score;
 }
 
-static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra )
+static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) /* idx: 4x4 block index within the MB */
 {
     int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
     if( h->mb.b_trellis )
-        x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra );
+        x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx ); /* idx is only consumed on the trellis path */
     else
         h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
 }
 
-static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra )
+static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx ) /* idx: 8x8 block index within the MB */
 {
     int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
     if( h->mb.b_trellis )
-        x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra );
+        x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx ); /* idx is only consumed on the trellis path */
     else
         h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
 }
@@ -111,7 +111,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
 
     h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
 
-    x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1 );
+    x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
 
     if( array_non_zero( dct4x4 ) )
     {
@@ -135,7 +135,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
 
     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
 
-    x264_quant_8x8( h, dct8x8, i_qp, 1 );
+    x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
 
     h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
     h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
@@ -174,7 +174,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         dct4x4[i][0][0] = 0;
 
         /* quant/scan/dequant */
-        x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1 );
+        x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
 
         h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
         h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
@@ -453,7 +453,7 @@ void x264_macroblock_encode( x264_t *h )
             {
                 if( h->mb.b_noise_reduction )
                     h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
-                x264_quant_8x8( h, dct8x8[idx], i_qp, 0 );
+                x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
 
                 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
 
@@ -498,7 +498,7 @@ void x264_macroblock_encode( x264_t *h )
 
                     if( h->mb.b_noise_reduction )
                         h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
-                    x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0 );
+                    x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
 
                     h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
 
@@ -777,7 +777,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         {
             DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
             h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
-            x264_quant_8x8( h, dct8x8, i_qp, 0 );
+            x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
             h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
 
             if( b_decimate && !h->mb.b_trellis )
@@ -797,7 +797,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
             h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
             for( i4 = 0; i4 < 4; i4++ )
-                x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0 );
+                x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
 
             for( i4 = 0; i4 < 4; i4++ )
                 h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index e2b9d318..7035aa21 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -50,9 +50,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
 
 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
-                             int i_qp, int i_ctxBlockCat, int b_intra );
+                             int i_qp, int i_ctxBlockCat, int b_intra, int idx );
 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
-                             int i_qp, int b_intra );
+                             int i_qp, int b_intra, int idx );
 
 void x264_noise_reduction_update( x264_t *h );
 
diff --git a/encoder/rdo.c b/encoder/rdo.c
index a8a2386e..f169e9a4 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -51,20 +51,78 @@ static uint16_t cabac_prefix_size[15][128];
 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
 
-static int ssd_mb( x264_t *h )
+
+#define ADD_ABS_SATD(satdtype, pixel)\
+    satd += abs((h->pixf.satdtype[pixel]( zero, 0, fdec, FDEC_STRIDE ) - dc_coef)\
+          - sum_##satdtype( h, pixel, x, y ));
+
+/* Sum the cached fenc SATD scores covering the `pixel`-sized block at pixel offset (x,y), to avoid repeating them. */
+static inline int sum_satd( x264_t *h, int pixel, int x, int y )
+{
+    int satd = 0;
+    int min_x = x>>2; /* pixel offset -> index into the 4x4 fenc_satd grid */
+    int min_y = y>>2;
+    int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2); /* exclusive end; .w/.h presumably the block size in pixels */
+    int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
+    if( pixel == PIXEL_16x16 )
+        return h->mb.pic.fenc_satd_sum; /* whole macroblock: use the precomputed total */
+    for( y = min_y; y < max_y; y++ ) /* x and y are reused as grid indices from here on */
+        for( x = min_x; x < max_x; x++ )
+            satd += h->mb.pic.fenc_satd[y][x];
+    return satd;
+}
+
+static inline int sum_sa8d( x264_t *h, int pixel, int x, int y ) /* 8x8-grid twin of sum_satd: sums cached fenc_sa8d scores */
+{
+    int sa8d = 0;
+    int min_x = x>>3; /* pixel offset -> index into the 2x2 fenc_sa8d grid */
+    int min_y = y>>3;
+    int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3); /* exclusive end; .w/.h presumably the block size in pixels */
+    int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
+    if( pixel == PIXEL_16x16 )
+        return h->mb.pic.fenc_sa8d_sum; /* whole macroblock: use the precomputed total */
+    for( y = min_y; y < max_y; y++ ) /* x and y are reused as grid indices from here on */
+        for( x = min_x; x < max_x; x++ )
+            sa8d += h->mb.pic.fenc_sa8d[y][x];
+    return sa8d;
+}
+
+/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
+/* SATD and SA8D are used to measure block complexity. */
+/* Both the SATD difference and the SA8D difference are used, to avoid bias from the DCT size.  Using SATD */
+/* only, for example, results in overuse of 8x8dct, while the opposite occurs when using SA8D only. */
+
+/* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
+/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
+/* This optimization can also be used in non-RD transform decision. */
+
+static inline int ssd_plane( x264_t *h, int size, int p, int x, int y ) /* SSD of plane p at (x,y), plus the psy RD term for plane 0 */
 {
-    return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
-                                     h->mb.pic.p_fdec[0], FDEC_STRIDE )
-         + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[1], FENC_STRIDE,
-                                     h->mb.pic.p_fdec[1], FDEC_STRIDE )
-         + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[2], FENC_STRIDE,
-                                     h->mb.pic.p_fdec[2], FDEC_STRIDE );
+    DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; /* zero row, passed with stride 0 as the comparison block */
+    int satd = 0; /* psy term; stays 0 when the branch below is skipped */
+    uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
+    uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
+    if( p == 0 && h->mb.i_psy_rd ) /* psy term only for plane 0, and only when psy RD is enabled */
+    {
+        int dc_coef = h->pixf.sad[size](zero, 0, fdec, FDEC_STRIDE) >> 1; /* same sad>>1 correction as the cached fenc_satd scores */
+        ADD_ABS_SATD(satd, size);
+        /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
+        if(size <= PIXEL_8x8) /* NOTE(review): relies on the PIXEL_* enum listing larger sizes first -- confirm */
+        {
+            dc_coef >>= 1; /* now sad>>2, matching the cached fenc_sa8d scores */
+            ADD_ABS_SATD(sa8d, size);
+            satd >>= 1; /* average of the SATD and SA8D differences */
+        }
+        satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8; /* scale by psy strength and lambda, then round off 8 fractional bits */
+    }
+    return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
 }
 
-static int ssd_plane( x264_t *h, int size, int p, int x, int y )
+static inline int ssd_mb( x264_t *h ) /* whole-MB distortion: 16x16 on plane 0 plus 8x8 on planes 1 and 2 */
 {
-    return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
-                              h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
+    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) /* only this plane-0 call can pick up the psy RD term */
+         + ssd_plane(h, PIXEL_8x8,   1, 0, 0)
+         + ssd_plane(h, PIXEL_8x8,   2, 0, 0);
 }
 
 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
@@ -269,7 +327,7 @@ static const int lambda2_tab[2][52] = {
 };
 
 typedef struct {
-    uint64_t score;
+    int64_t score;
     int level_idx; // index into level_tree[]
     uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
 } trellis_node_t;
@@ -298,7 +356,7 @@ typedef struct {
 static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
                                  const uint16_t *quant_mf, const int *unquant_mf,
                                  const int *coef_weight, const uint8_t *zigzag,
-                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )
+                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs, int idx )
 {
     int abs_coefs[64], signs[64];
     trellis_node_t nodes[2][8];
@@ -430,8 +488,20 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
         // that are better left coded, especially at QP > 40.
         for( abs_level = q; abs_level >= q-1; abs_level-- )
         {
-            int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
-            uint64_t ssd = (int64_t)d*d * coef_weight[i];
+            int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
+            int d = i_coef - unquant_abs_level;
+            int64_t ssd;
+            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
+            if( h->mb.i_psy_trellis && i )
+            {
+                int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
+                int predicted_coef = orig_coef - i_coef * signs[i];
+                int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
+                int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
+                ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
+            }
+            else
+                ssd = (int64_t)d*d * coef_weight[i];
 
             for( j = 0; j < 8; j++ )
             {
@@ -495,24 +565,24 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
 
 
 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
-                             int i_qp, int i_ctxBlockCat, int b_intra )
+                             int i_qp, int i_ctxBlockCat, int b_intra, int idx ) /* idx: 4x4 block index, selects fenc_dct4[idx] for psy trellis */
 {
     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
     quant_trellis_cabac( h, (int16_t*)dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );
+        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16, idx ); /* 16 = i_coefs, which makes the psy path read fenc_dct4 */
 }
 
 
 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
-                             int i_qp, int b_intra )
+                             int i_qp, int b_intra, int idx ) /* idx: 8x8 block index, selects fenc_dct8[idx] for psy trellis */
 {
     quant_trellis_cabac( h, (int16_t*)dct,
         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan8[h->mb.b_interlaced],
-        DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );
+        DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64, idx ); /* 64 = i_coefs, which makes the psy path read fenc_dct8 */
 }
 
diff --git a/x264.c b/x264.c
index e8e9f435..b6341d82 100644
--- a/x264.c
+++ b/x264.c
@@ -251,6 +251,10 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H0( "  -m, --subme <integer>       Subpixel motion estimation and partition\n"
         "                                  decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
     H0( "      --b-rdo                 RD based mode decision for B-frames. Requires subme 6.\n" );
+    H0( "      --psy-rd                Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
+        "                                  #1: RDO (requires subme>=6)\n"
+        "                                  #2: Trellis (requires trellis, experimental)\n",
+                                       defaults->analyse.f_psy_rd,defaults->analyse.f_psy_trellis );
     H0( "      --mixed-refs            Decide references on a per partition basis\n" );
     H1( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
     H1( "      --bime                  Jointly optimize both MVs in B-frames\n" );
@@ -420,6 +424,7 @@ static int  Parse( int argc, char **argv,
             { "mvrange", required_argument, NULL, 0 },
             { "mvrange-thread", required_argument, NULL, 0 },
             { "subme",   required_argument, NULL, 'm' },
+            { "psy-rd",   required_argument, NULL, 0 },
             { "b-rdo",   no_argument,       NULL, 0 },
             { "mixed-refs", no_argument,    NULL, 0 },
             { "no-chroma-me", no_argument,  NULL, 0 },
diff --git a/x264.h b/x264.h
index 839645bc..4d81bced 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 63
+#define X264_BUILD 64
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -241,6 +241,8 @@ typedef struct x264_param_t
         int          b_fast_pskip; /* early SKIP detection on P-frames */
         int          b_dct_decimate; /* transform coefficient thresholding on P-frames */
         int          i_noise_reduction; /* adaptive pseudo-deadzone */
+        float        f_psy_rd; /* Psy RD strength */
+        float        f_psy_trellis; /* Psy trellis strength */
 
         /* the deadzone size that will be used in luma quantization */
         int          i_luma_deadzone[2]; /* {inter, intra} */
-- 
2.40.0