From 81c6041922266cc20cf2507365cfee3335eb83d4 Mon Sep 17 00:00:00 2001
From: Paul Wilkins <paulwilkins@google.com>
Date: Fri, 5 Oct 2012 11:16:46 +0100
Subject: [PATCH] Fix SIMD unsafe use of floating point.

This commit fixes unsafe simd / floating point interactions arising
from the current hybrid and 16x16 transform implementation.
These led to a raft of bugs and issues when the project was
built using VS2008 for Win32 though they did not show up with
the unix builds.

Gerrit makes a meal out of presenting the fix but all I have actually
done is indent the body of each function that uses floating point by
one level and bracket with emms instructions using  the function
vp8_clear_system_state(). See below.

function () {
  vp8_clear_system_state();
  {
  ... function body
  }
  vp8_clear_system_state();
}

This is almost certainly over the top in terms of number of emms
instructions but is a temporary measure pending implementation of
integer variants of each function to replace the floating point.

Limited testing suggests that this fixes the problems that arose for
Win32 VS2008 when the hybrid or 16x16 transforms were enabled.

Change-Id: I7c9a72bd79315246ed880578dec51e2b7c178442
---
 vp8/common/idctllm.c | 661 ++++++++++++++++++++++---------------------
 vp8/encoder/dct.c    | 484 ++++++++++++++++---------------
 2 files changed, 593 insertions(+), 552 deletions(-)

diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index b4475c628..5eb9d0c8a 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -24,6 +24,7 @@
  **************************************************************************/
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/systemdependent.h"
 
 #if CONFIG_HYBRIDTRANSFORM
 #include "vp8/common/blockd.h"
@@ -166,92 +167,71 @@ float iadst_16[256] = {
 #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 void vp8_ihtllm_c(short *input, short *output, int pitch,
                   TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  float bufa[256], bufb[256]; // buffers are for floating-point test purpose
-                            // the implementation could be simplified in
-                            // conjunction with integer transform
-
-                            // further notice, since we are thinking to use one
-                            // function for both 4x4 and 8x8 transforms, the
-                            // temporary buffers are simply initialized with 64.
-  short *ip = input;
-  short *op = output;
-  int shortpitch = pitch >> 1;
 
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
-
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
-
-  // load and convert residual array into floating-point
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += tx_dim;
-    ip  += tx_dim;
-  }
-
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &iadst_4[0] :
-                            ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-      break;
-
-    default :
-      ptv = (tx_dim == 4) ? &idct_4[0] :
-                            ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-      break;
-  }
-
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfb[i] = 0 ;
-      for(k = 0; k < tx_dim; k++) {
-        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256]; // buffers are for floating-point test purpose
+                                // the implementation could be simplified in
+                                // conjunction with integer transform
+
+                                // further notice, since we are thinking to use
+                                // one function for both 4x4 and 8x8 transforms
+                                // the temporary buffers are simply initialized
+                                // with 64.
+    short *ip = input;
+    short *op = output;
+    int shortpitch = pitch >> 1;
+
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
+
+    // pointers to vertical and horizontal transforms
+    float *ptv, *pth;
+
+    // load and convert residual array into floating-point
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
       }
-      pfa += 1;
+      pfa += tx_dim;
+      ip  += tx_dim;
     }
 
-    pfb += tx_dim;
-    ptv += tx_dim;
+    // vertical transformation
     pfa = &bufa[0];
-  }
+    pfb = &bufb[0];
 
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = (tx_dim == 4) ? &iadst_4[0] :
-                            ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-      break;
-
-    default :
-      pth = (tx_dim == 4) ? &idct_4[0] :
-                            ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-      break;
-  }
+    switch(tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &iadst_4[0] :
+                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+        break;
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < tx_dim; k++) {
-        pfa[i] += pfb[k] * pth[k];
+      default :
+        ptv = (tx_dim == 4) ? &idct_4[0] :
+                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+        break;
+    }
+
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfb[i] = 0 ;
+        for(k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
       }
-      pth += tx_dim;
-     }
 
-    pfa += tx_dim;
-    pfb += tx_dim;
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
+    }
+
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
 
     switch(tx_type) {
       case ADST_ADST :
@@ -265,21 +245,48 @@ void vp8_ihtllm_c(short *input, short *output, int pitch,
                               ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
         break;
     }
-  }
-
-  // convert to short integer format and load BLOCKD buffer
-  op  = output;
-  pfa = &bufa[0];
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
-                             -(short)( - pfa[i] / 8 + 0.49);
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for(k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+       }
+
+      pfa += tx_dim;
+      pfb += tx_dim;
+
+      switch(tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &iadst_4[0] :
+                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &idct_4[0] :
+                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+          break;
+      }
     }
 
-    op  += shortpitch;
-    pfa += tx_dim;
+    // convert to short integer format and load BLOCKD buffer
+    op  = output;
+    pfa = &bufa[0];
+
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
+                               -(short)( - pfa[i] / 8 + 0.49);
+      }
+
+      op  += shortpitch;
+      pfa += tx_dim;
+    }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
@@ -776,25 +783,30 @@ void vp8_short_ihaar2x2_c(short *input, short *output, int pitch) {
 #if 0
 // Keep a really bad float version as reference for now.
 void vp8_short_idct16x16_c(short *input, short *output, int pitch) {
-  double x;
-  const int short_pitch = pitch >> 1;
-  int i, j, k, l;
-  for (l = 0; l < 16; ++l) {
-    for (k = 0; k < 16; ++k) {
-      double s = 0;
-      for (i = 0; i < 16; ++i) {
-        for (j = 0; j < 16; ++j) {
-          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double x;
+    const int short_pitch = pitch >> 1;
+    int i, j, k, l;
+    for (l = 0; l < 16; ++l) {
+      for (k = 0; k < 16; ++k) {
+        double s = 0;
+        for (i = 0; i < 16; ++i) {
+          for (j = 0; j < 16; ++j) {
+            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
+            if (i != 0)
+              x *= sqrt(2.0);
+            if (j != 0)
+              x *= sqrt(2.0);
+            s += x;
+          }
         }
+        output[k*short_pitch+l] = (short)round(s);
       }
-      output[k*short_pitch+l] = (short)round(s);
     }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
@@ -816,231 +828,246 @@ static const double C15 = 0.098017140329561;
 
 
 static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
-  double step[16];
-  double intermediate[16];
-  double temp1, temp2;
-
-
-  // step 1 and 2
-  step[ 0] = input[0] + input[8];
-  step[ 1] = input[0] - input[8];
-
-  temp1 = input[4]*C12;
-  temp2 = input[12]*C4;
-
-  temp1 -= temp2;
-  temp1 *= C8;
-
-  step[ 2] = 2*(temp1);
-
-  temp1 = input[4]*C4;
-  temp2 = input[12]*C12;
-  temp1 += temp2;
-  temp1 = (temp1);
-  temp1 *= C8;
-  step[ 3] = 2*(temp1);
-
-  temp1 = input[2]*C8;
-  temp1 = 2*(temp1);
-  temp2 = input[6] + input[10];
-
-  step[ 4] = temp1 + temp2;
-  step[ 5] = temp1 - temp2;
-
-  temp1 = input[14]*C8;
-  temp1 = 2*(temp1);
-  temp2 = input[6] - input[10];
-
-  step[ 6] = temp2 - temp1;
-  step[ 7] = temp2 + temp1;
-
-  // for odd input
-  temp1 = input[3]*C12;
-  temp2 = input[13]*C4;
-  temp1 += temp2;
-  temp1 = (temp1);
-  temp1 *= C8;
-  intermediate[ 8] = 2*(temp1);
-
-  temp1 = input[3]*C4;
-  temp2 = input[13]*C12;
-  temp2 -= temp1;
-  temp2 = (temp2);
-  temp2 *= C8;
-  intermediate[ 9] = 2*(temp2);
-
-  intermediate[10] = 2*(input[9]*C8);
-  intermediate[11] = input[15] - input[1];
-  intermediate[12] = input[15] + input[1];
-  intermediate[13] = 2*((input[7]*C8));
-
-  temp1 = input[11]*C12;
-  temp2 = input[5]*C4;
-  temp2 -= temp1;
-  temp2 = (temp2);
-  temp2 *= C8;
-  intermediate[14] = 2*(temp2);
-
-  temp1 = input[11]*C4;
-  temp2 = input[5]*C12;
-  temp1 += temp2;
-  temp1 = (temp1);
-  temp1 *= C8;
-  intermediate[15] = 2*(temp1);
-
-  step[ 8] = intermediate[ 8] + intermediate[14];
-  step[ 9] = intermediate[ 9] + intermediate[15];
-  step[10] = intermediate[10] + intermediate[11];
-  step[11] = intermediate[10] - intermediate[11];
-  step[12] = intermediate[12] + intermediate[13];
-  step[13] = intermediate[12] - intermediate[13];
-  step[14] = intermediate[ 8] - intermediate[14];
-  step[15] = intermediate[ 9] - intermediate[15];
-
-  // step 3
-  output[0] = step[ 0] + step[ 3];
-  output[1] = step[ 1] + step[ 2];
-  output[2] = step[ 1] - step[ 2];
-  output[3] = step[ 0] - step[ 3];
-
-  temp1 = step[ 4]*C14;
-  temp2 = step[ 7]*C2;
-  temp1 -= temp2;
-  output[4] =  (temp1);
-
-  temp1 = step[ 4]*C2;
-  temp2 = step[ 7]*C14;
-  temp1 += temp2;
-  output[7] =  (temp1);
-
-  temp1 = step[ 5]*C10;
-  temp2 = step[ 6]*C6;
-  temp1 -= temp2;
-  output[5] =  (temp1);
-
-  temp1 = step[ 5]*C6;
-  temp2 = step[ 6]*C10;
-  temp1 += temp2;
-  output[6] =  (temp1);
-
-  output[8] = step[ 8] + step[11];
-  output[9] = step[ 9] + step[10];
-  output[10] = step[ 9] - step[10];
-  output[11] = step[ 8] - step[11];
-  output[12] = step[12] + step[15];
-  output[13] = step[13] + step[14];
-  output[14] = step[13] - step[14];
-  output[15] = step[12] - step[15];
-
-  // output 4
-  step[ 0] = output[0] + output[7];
-  step[ 1] = output[1] + output[6];
-  step[ 2] = output[2] + output[5];
-  step[ 3] = output[3] + output[4];
-  step[ 4] = output[3] - output[4];
-  step[ 5] = output[2] - output[5];
-  step[ 6] = output[1] - output[6];
-  step[ 7] = output[0] - output[7];
-
-  temp1 = output[8]*C7;
-  temp2 = output[15]*C9;
-  temp1 -= temp2;
-  step[ 8] = (temp1);
-
-  temp1 = output[9]*C11;
-  temp2 = output[14]*C5;
-  temp1 += temp2;
-  step[ 9] = (temp1);
-
-  temp1 = output[10]*C3;
-  temp2 = output[13]*C13;
-  temp1 -= temp2;
-  step[10] = (temp1);
-
-  temp1 = output[11]*C15;
-  temp2 = output[12]*C1;
-  temp1 += temp2;
-  step[11] = (temp1);
-
-  temp1 = output[11]*C1;
-  temp2 = output[12]*C15;
-  temp2 -= temp1;
-  step[12] = (temp2);
-
-  temp1 = output[10]*C13;
-  temp2 = output[13]*C3;
-  temp1 += temp2;
-  step[13] = (temp1);
-
-  temp1 = output[9]*C5;
-  temp2 = output[14]*C11;
-  temp2 -= temp1;
-  step[14] = (temp2);
-
-  temp1 = output[8]*C9;
-  temp2 = output[15]*C7;
-  temp1 += temp2;
-  step[15] = (temp1);
-
-  // step 5
-  output[0] = (step[0] + step[15]);
-  output[1] = (step[1] + step[14]);
-  output[2] = (step[2] + step[13]);
-  output[3] = (step[3] + step[12]);
-  output[4] = (step[4] + step[11]);
-  output[5] = (step[5] + step[10]);
-  output[6] = (step[6] + step[ 9]);
-  output[7] = (step[7] + step[ 8]);
-
-  output[15] = (step[0] - step[15]);
-  output[14] = (step[1] - step[14]);
-  output[13] = (step[2] - step[13]);
-  output[12] = (step[3] - step[12]);
-  output[11] = (step[4] - step[11]);
-  output[10] = (step[5] - step[10]);
-  output[9] = (step[6] - step[ 9]);
-  output[8] = (step[7] - step[ 8]);
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+
+    // step 1 and 2
+    step[ 0] = input[0] + input[8];
+    step[ 1] = input[0] - input[8];
+
+    temp1 = input[4]*C12;
+    temp2 = input[12]*C4;
+
+    temp1 -= temp2;
+    temp1 *= C8;
+
+    step[ 2] = 2*(temp1);
+
+    temp1 = input[4]*C4;
+    temp2 = input[12]*C12;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    step[ 3] = 2*(temp1);
+
+    temp1 = input[2]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] + input[10];
+
+    step[ 4] = temp1 + temp2;
+    step[ 5] = temp1 - temp2;
+
+    temp1 = input[14]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] - input[10];
+
+    step[ 6] = temp2 - temp1;
+    step[ 7] = temp2 + temp1;
+
+    // for odd input
+    temp1 = input[3]*C12;
+    temp2 = input[13]*C4;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    intermediate[ 8] = 2*(temp1);
+
+    temp1 = input[3]*C4;
+    temp2 = input[13]*C12;
+    temp2 -= temp1;
+    temp2 = (temp2);
+    temp2 *= C8;
+    intermediate[ 9] = 2*(temp2);
+
+    intermediate[10] = 2*(input[9]*C8);
+    intermediate[11] = input[15] - input[1];
+    intermediate[12] = input[15] + input[1];
+    intermediate[13] = 2*((input[7]*C8));
+
+    temp1 = input[11]*C12;
+    temp2 = input[5]*C4;
+    temp2 -= temp1;
+    temp2 = (temp2);
+    temp2 *= C8;
+    intermediate[14] = 2*(temp2);
+
+    temp1 = input[11]*C4;
+    temp2 = input[5]*C12;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    intermediate[15] = 2*(temp1);
+
+    step[ 8] = intermediate[ 8] + intermediate[14];
+    step[ 9] = intermediate[ 9] + intermediate[15];
+    step[10] = intermediate[10] + intermediate[11];
+    step[11] = intermediate[10] - intermediate[11];
+    step[12] = intermediate[12] + intermediate[13];
+    step[13] = intermediate[12] - intermediate[13];
+    step[14] = intermediate[ 8] - intermediate[14];
+    step[15] = intermediate[ 9] - intermediate[15];
+
+    // step 3
+    output[0] = step[ 0] + step[ 3];
+    output[1] = step[ 1] + step[ 2];
+    output[2] = step[ 1] - step[ 2];
+    output[3] = step[ 0] - step[ 3];
+
+    temp1 = step[ 4]*C14;
+    temp2 = step[ 7]*C2;
+    temp1 -= temp2;
+    output[4] =  (temp1);
+
+    temp1 = step[ 4]*C2;
+    temp2 = step[ 7]*C14;
+    temp1 += temp2;
+    output[7] =  (temp1);
+
+    temp1 = step[ 5]*C10;
+    temp2 = step[ 6]*C6;
+    temp1 -= temp2;
+    output[5] =  (temp1);
+
+    temp1 = step[ 5]*C6;
+    temp2 = step[ 6]*C10;
+    temp1 += temp2;
+    output[6] =  (temp1);
+
+    output[8] = step[ 8] + step[11];
+    output[9] = step[ 9] + step[10];
+    output[10] = step[ 9] - step[10];
+    output[11] = step[ 8] - step[11];
+    output[12] = step[12] + step[15];
+    output[13] = step[13] + step[14];
+    output[14] = step[13] - step[14];
+    output[15] = step[12] - step[15];
+
+    // output 4
+    step[ 0] = output[0] + output[7];
+    step[ 1] = output[1] + output[6];
+    step[ 2] = output[2] + output[5];
+    step[ 3] = output[3] + output[4];
+    step[ 4] = output[3] - output[4];
+    step[ 5] = output[2] - output[5];
+    step[ 6] = output[1] - output[6];
+    step[ 7] = output[0] - output[7];
+
+    temp1 = output[8]*C7;
+    temp2 = output[15]*C9;
+    temp1 -= temp2;
+    step[ 8] = (temp1);
+
+    temp1 = output[9]*C11;
+    temp2 = output[14]*C5;
+    temp1 += temp2;
+    step[ 9] = (temp1);
+
+    temp1 = output[10]*C3;
+    temp2 = output[13]*C13;
+    temp1 -= temp2;
+    step[10] = (temp1);
+
+    temp1 = output[11]*C15;
+    temp2 = output[12]*C1;
+    temp1 += temp2;
+    step[11] = (temp1);
+
+    temp1 = output[11]*C1;
+    temp2 = output[12]*C15;
+    temp2 -= temp1;
+    step[12] = (temp2);
+
+    temp1 = output[10]*C13;
+    temp2 = output[13]*C3;
+    temp1 += temp2;
+    step[13] = (temp1);
+
+    temp1 = output[9]*C5;
+    temp2 = output[14]*C11;
+    temp2 -= temp1;
+    step[14] = (temp2);
+
+    temp1 = output[8]*C9;
+    temp2 = output[15]*C7;
+    temp1 += temp2;
+    step[15] = (temp1);
+
+    // step 5
+    output[0] = (step[0] + step[15]);
+    output[1] = (step[1] + step[14]);
+    output[2] = (step[2] + step[13]);
+    output[3] = (step[3] + step[12]);
+    output[4] = (step[4] + step[11]);
+    output[5] = (step[5] + step[10]);
+    output[6] = (step[6] + step[ 9]);
+    output[7] = (step[7] + step[ 8]);
+
+    output[15] = (step[0] - step[15]);
+    output[14] = (step[1] - step[14]);
+    output[13] = (step[2] - step[13]);
+    output[12] = (step[3] - step[12]);
+    output[11] = (step[4] - step[11]);
+    output[10] = (step[5] - step[10]);
+    output[9] = (step[6] - step[ 9]);
+    output[8] = (step[7] - step[ 8]);
+  }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 
 // Remove once an int version of iDCT is written
 #if 0
 void reference_16x16_idct_1d(double input[16], double output[16]) {
-  const double kPi = 3.141592653589793238462643383279502884;
-  const double kSqrt2 = 1.414213562373095048801688724209698;
-  for (int k = 0; k < 16; k++) {
-    output[k] = 0.0;
-    for (int n = 0; n < 16; n++) {
-      output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
-      if (n == 0)
-        output[k] = output[k]/kSqrt2;
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    const double kPi = 3.141592653589793238462643383279502884;
+    const double kSqrt2 = 1.414213562373095048801688724209698;
+    for (int k = 0; k < 16; k++) {
+      output[k] = 0.0;
+      for (int n = 0; n < 16; n++) {
+        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
+        if (n == 0)
+          output[k] = output[k]/kSqrt2;
+      }
     }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
 void vp8_short_idct16x16_c(short *input, short *output, int pitch) {
-  double out[16*16], out2[16*16];
-  const int short_pitch = pitch >> 1;
-  int i, j;
-    // First transform rows
-  for (i = 0; i < 16; ++i) {
-    double temp_in[16], temp_out[16];
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j + i*short_pitch];
-    butterfly_16x16_idct_1d(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      out[j + i*16] = temp_out[j];
-  }
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    double temp_in[16], temp_out[16];
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j*16 + i];
-    butterfly_16x16_idct_1d(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      out2[j*16 + i] = temp_out[j];
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double out[16*16], out2[16*16];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+      // First transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out[j + i*16] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j*16 + i];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out2[j*16 + i] = temp_out[j];
+    }
+    for (i = 0; i < 16*16; ++i)
+      output[i] = round(out2[i]/128);
   }
-  for (i = 0; i < 16*16; ++i)
-    output[i] = round(out2[i]/128);
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 209ba20b5..81b4e6be2 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -12,6 +12,7 @@
 #include <math.h>
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/systemdependent.h"
 
 #if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
 
@@ -402,87 +403,64 @@ void vp8_short_fhaar2x2_c(short *input, short *output, int pitch) { // pitch = 8
 #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 void vp8_fht_c(short *input, short *output, int pitch,
                TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  float bufa[256], bufb[256]; // buffers are for floating-point test purpose
-                             // the implementation could be simplified in
-                             // conjunction with integer transform
-  short *ip = input;
-  short *op = output;
-
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
-
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
-
-  // load and convert residual array into floating-point
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += tx_dim;
-    ip  += pitch / 2;
-  }
-
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &adst_4[0] :
-                            ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-      break;
-
-    default :
-      ptv = (tx_dim == 4) ? &dct_4[0] :
-                            ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-      break;
-  }
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfb[i] = 0;
-      for(k = 0; k < tx_dim; k++) {
-        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256]; // buffers are for floating-point test purpose
+                               // the implementation could be simplified in
+                               // conjunction with integer transform
+    short *ip = input;
+    short *op = output;
+
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
+
+    // pointers to vertical and horizontal transforms
+    float *ptv, *pth;
+
+    // load and convert residual array into floating-point
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
       }
-      pfa += 1;
+      pfa += tx_dim;
+      ip  += pitch / 2;
     }
-    pfb += tx_dim;
-    ptv += tx_dim;
+
+    // vertical transformation
     pfa = &bufa[0];
-  }
+    pfb = &bufb[0];
 
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = (tx_dim == 4) ? &adst_4[0] :
-                            ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-      break;
-
-    default :
-      pth = (tx_dim == 4) ? &dct_4[0] :
-                            ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-      break;
-  }
+    switch(tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &adst_4[0] :
+                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+        break;
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < tx_dim; k++) {
-        pfa[i] += pfb[k] * pth[k];
+      default :
+        ptv = (tx_dim == 4) ? &dct_4[0] :
+                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+        break;
+    }
+
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfb[i] = 0;
+        for(k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
       }
-      pth += tx_dim;
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
     }
 
-    pfa += tx_dim;
-    pfb += tx_dim;
-    // pth -= tx_dim * tx_dim;
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
 
     switch(tx_type) {
       case ADST_ADST :
@@ -496,20 +474,48 @@ void vp8_fht_c(short *input, short *output, int pitch,
                               ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
         break;
     }
-  }
 
-  // convert to short integer format and load BLOCKD buffer
-  op  = output ;
-  pfa = &bufa[0] ;
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for(k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+      }
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
-                                   -(short)(- 8 * pfa[i] + 0.49);
+      pfa += tx_dim;
+      pfb += tx_dim;
+      // pth -= tx_dim * tx_dim;
+
+      switch(tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &adst_4[0] :
+                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &dct_4[0] :
+                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+          break;
+      }
+    }
+
+    // convert to short integer format and load BLOCKD buffer
+    op  = output ;
+    pfa = &bufa[0] ;
+
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
+                                     -(short)(- 8 * pfa[i] + 0.49);
+      }
+      op  += tx_dim;
+      pfa += tx_dim;
     }
-    op  += tx_dim;
-    pfa += tx_dim;
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
@@ -705,162 +711,168 @@ static const double C14 = 0.195090322016128;
 static const double C15 = 0.098017140329561;
 
 static void dct16x16_1d(double input[16], double output[16]) {
-  double step[16];
-  double intermediate[16];
-  double temp1, temp2;
-
-  // step 1
-  step[ 0] = input[0] + input[15];
-  step[ 1] = input[1] + input[14];
-  step[ 2] = input[2] + input[13];
-  step[ 3] = input[3] + input[12];
-  step[ 4] = input[4] + input[11];
-  step[ 5] = input[5] + input[10];
-  step[ 6] = input[6] + input[ 9];
-  step[ 7] = input[7] + input[ 8];
-  step[ 8] = input[7] - input[ 8];
-  step[ 9] = input[6] - input[ 9];
-  step[10] = input[5] - input[10];
-  step[11] = input[4] - input[11];
-  step[12] = input[3] - input[12];
-  step[13] = input[2] - input[13];
-  step[14] = input[1] - input[14];
-  step[15] = input[0] - input[15];
-
-  // step 2
-  output[0] = step[0] + step[7];
-  output[1] = step[1] + step[6];
-  output[2] = step[2] + step[5];
-  output[3] = step[3] + step[4];
-  output[4] = step[3] - step[4];
-  output[5] = step[2] - step[5];
-  output[6] = step[1] - step[6];
-  output[7] = step[0] - step[7];
-
-  temp1 = step[ 8]*C7;
-  temp2 = step[15]*C9;
-  output[ 8] = temp1 + temp2;
-
-  temp1 = step[ 9]*C11;
-  temp2 = step[14]*C5;
-  output[ 9] = temp1 - temp2;
-
-  temp1 = step[10]*C3;
-  temp2 = step[13]*C13;
-  output[10] = temp1 + temp2;
-
-  temp1 = step[11]*C15;
-  temp2 = step[12]*C1;
-  output[11] = temp1 - temp2;
-
-  temp1 = step[11]*C1;
-  temp2 = step[12]*C15;
-  output[12] = temp2 + temp1;
-
-  temp1 = step[10]*C13;
-  temp2 = step[13]*C3;
-  output[13] = temp2 - temp1;
-
-  temp1 = step[ 9]*C5;
-  temp2 = step[14]*C11;
-  output[14] = temp2 + temp1;
-
-  temp1 = step[ 8]*C9;
-  temp2 = step[15]*C7;
-  output[15] = temp2 - temp1;
-
-  // step 3
-  step[ 0] = output[0] + output[3];
-  step[ 1] = output[1] + output[2];
-  step[ 2] = output[1] - output[2];
-  step[ 3] = output[0] - output[3];
-
-  temp1 = output[4]*C14;
-  temp2 = output[7]*C2;
-  step[ 4] = temp1 + temp2;
-
-  temp1 = output[5]*C10;
-  temp2 = output[6]*C6;
-  step[ 5] = temp1 + temp2;
-
-  temp1 = output[5]*C6;
-  temp2 = output[6]*C10;
-  step[ 6] = temp2 - temp1;
-
-  temp1 = output[4]*C2;
-  temp2 = output[7]*C14;
-  step[ 7] = temp2 - temp1;
-
-  step[ 8] = output[ 8] + output[11];
-  step[ 9] = output[ 9] + output[10];
-  step[10] = output[ 9] - output[10];
-  step[11] = output[ 8] - output[11];
-
-  step[12] = output[12] + output[15];
-  step[13] = output[13] + output[14];
-  step[14] = output[13] - output[14];
-  step[15] = output[12] - output[15];
-
-  // step 4
-  output[ 0] = (step[ 0] + step[ 1]);
-  output[ 8] = (step[ 0] - step[ 1]);
-
-  temp1 = step[2]*C12;
-  temp2 = step[3]*C4;
-  temp1 = temp1 + temp2;
-  output[ 4] = 2*(temp1*C8);
-
-  temp1 = step[2]*C4;
-  temp2 = step[3]*C12;
-  temp1 = temp2 - temp1;
-  output[12] = 2*(temp1*C8);
-
-  output[ 2] = 2*((step[4] + step[ 5])*C8);
-  output[14] = 2*((step[7] - step[ 6])*C8);
-
-  temp1 = step[4] - step[5];
-  temp2 = step[6] + step[7];
-  output[ 6] = (temp1 + temp2);
-  output[10] = (temp1 - temp2);
-
-  intermediate[8] = step[8] + step[14];
-  intermediate[9] = step[9] + step[15];
-
-  temp1 = intermediate[8]*C12;
-  temp2 = intermediate[9]*C4;
-  temp1 = temp1 - temp2;
-  output[3] = 2*(temp1*C8);
-
-  temp1 = intermediate[8]*C4;
-  temp2 = intermediate[9]*C12;
-  temp1 = temp2 + temp1;
-  output[13] = 2*(temp1*C8);
-
-  output[ 9] = 2*((step[10] + step[11])*C8);
-
-  intermediate[11] = step[10] - step[11];
-  intermediate[12] = step[12] + step[13];
-  intermediate[13] = step[12] - step[13];
-  intermediate[14] = step[ 8] - step[14];
-  intermediate[15] = step[ 9] - step[15];
-
-  output[15] = (intermediate[11] + intermediate[12]);
-  output[ 1] = -(intermediate[11] - intermediate[12]);
-
-  output[ 7] = 2*(intermediate[13]*C8);
-
-  temp1 = intermediate[14]*C12;
-  temp2 = intermediate[15]*C4;
-  temp1 = temp1 - temp2;
-  output[11] = -2*(temp1*C8);
-
-  temp1 = intermediate[14]*C4;
-  temp2 = intermediate[15]*C12;
-  temp1 = temp2 + temp1;
-  output[ 5] = 2*(temp1*C8);
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+    // step 1
+    step[ 0] = input[0] + input[15];
+    step[ 1] = input[1] + input[14];
+    step[ 2] = input[2] + input[13];
+    step[ 3] = input[3] + input[12];
+    step[ 4] = input[4] + input[11];
+    step[ 5] = input[5] + input[10];
+    step[ 6] = input[6] + input[ 9];
+    step[ 7] = input[7] + input[ 8];
+    step[ 8] = input[7] - input[ 8];
+    step[ 9] = input[6] - input[ 9];
+    step[10] = input[5] - input[10];
+    step[11] = input[4] - input[11];
+    step[12] = input[3] - input[12];
+    step[13] = input[2] - input[13];
+    step[14] = input[1] - input[14];
+    step[15] = input[0] - input[15];
+
+    // step 2
+    output[0] = step[0] + step[7];
+    output[1] = step[1] + step[6];
+    output[2] = step[2] + step[5];
+    output[3] = step[3] + step[4];
+    output[4] = step[3] - step[4];
+    output[5] = step[2] - step[5];
+    output[6] = step[1] - step[6];
+    output[7] = step[0] - step[7];
+
+    temp1 = step[ 8]*C7;
+    temp2 = step[15]*C9;
+    output[ 8] = temp1 + temp2;
+
+    temp1 = step[ 9]*C11;
+    temp2 = step[14]*C5;
+    output[ 9] = temp1 - temp2;
+
+    temp1 = step[10]*C3;
+    temp2 = step[13]*C13;
+    output[10] = temp1 + temp2;
+
+    temp1 = step[11]*C15;
+    temp2 = step[12]*C1;
+    output[11] = temp1 - temp2;
+
+    temp1 = step[11]*C1;
+    temp2 = step[12]*C15;
+    output[12] = temp2 + temp1;
+
+    temp1 = step[10]*C13;
+    temp2 = step[13]*C3;
+    output[13] = temp2 - temp1;
+
+    temp1 = step[ 9]*C5;
+    temp2 = step[14]*C11;
+    output[14] = temp2 + temp1;
+
+    temp1 = step[ 8]*C9;
+    temp2 = step[15]*C7;
+    output[15] = temp2 - temp1;
+
+    // step 3
+    step[ 0] = output[0] + output[3];
+    step[ 1] = output[1] + output[2];
+    step[ 2] = output[1] - output[2];
+    step[ 3] = output[0] - output[3];
+
+    temp1 = output[4]*C14;
+    temp2 = output[7]*C2;
+    step[ 4] = temp1 + temp2;
+
+    temp1 = output[5]*C10;
+    temp2 = output[6]*C6;
+    step[ 5] = temp1 + temp2;
+
+    temp1 = output[5]*C6;
+    temp2 = output[6]*C10;
+    step[ 6] = temp2 - temp1;
+
+    temp1 = output[4]*C2;
+    temp2 = output[7]*C14;
+    step[ 7] = temp2 - temp1;
+
+    step[ 8] = output[ 8] + output[11];
+    step[ 9] = output[ 9] + output[10];
+    step[10] = output[ 9] - output[10];
+    step[11] = output[ 8] - output[11];
+
+    step[12] = output[12] + output[15];
+    step[13] = output[13] + output[14];
+    step[14] = output[13] - output[14];
+    step[15] = output[12] - output[15];
+
+    // step 4
+    output[ 0] = (step[ 0] + step[ 1]);
+    output[ 8] = (step[ 0] - step[ 1]);
+
+    temp1 = step[2]*C12;
+    temp2 = step[3]*C4;
+    temp1 = temp1 + temp2;
+    output[ 4] = 2*(temp1*C8);
+
+    temp1 = step[2]*C4;
+    temp2 = step[3]*C12;
+    temp1 = temp2 - temp1;
+    output[12] = 2*(temp1*C8);
+
+    output[ 2] = 2*((step[4] + step[ 5])*C8);
+    output[14] = 2*((step[7] - step[ 6])*C8);
+
+    temp1 = step[4] - step[5];
+    temp2 = step[6] + step[7];
+    output[ 6] = (temp1 + temp2);
+    output[10] = (temp1 - temp2);
+
+    intermediate[8] = step[8] + step[14];
+    intermediate[9] = step[9] + step[15];
+
+    temp1 = intermediate[8]*C12;
+    temp2 = intermediate[9]*C4;
+    temp1 = temp1 - temp2;
+    output[3] = 2*(temp1*C8);
+
+    temp1 = intermediate[8]*C4;
+    temp2 = intermediate[9]*C12;
+    temp1 = temp2 + temp1;
+    output[13] = 2*(temp1*C8);
+
+    output[ 9] = 2*((step[10] + step[11])*C8);
+
+    intermediate[11] = step[10] - step[11];
+    intermediate[12] = step[12] + step[13];
+    intermediate[13] = step[12] - step[13];
+    intermediate[14] = step[ 8] - step[14];
+    intermediate[15] = step[ 9] - step[15];
+
+    output[15] = (intermediate[11] + intermediate[12]);
+    output[ 1] = -(intermediate[11] - intermediate[12]);
+
+    output[ 7] = 2*(intermediate[13]*C8);
+
+    temp1 = intermediate[14]*C12;
+    temp2 = intermediate[15]*C4;
+    temp1 = temp1 - temp2;
+    output[11] = -2*(temp1*C8);
+
+    temp1 = intermediate[14]*C4;
+    temp2 = intermediate[15]*C12;
+    temp1 = temp2 + temp1;
+    output[ 5] = 2*(temp1*C8);
+  }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 
 void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
     int shortpitch = pitch >> 1;
     int i, j;
     double output[256];
@@ -885,5 +897,7 @@ void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {
     // Scale by some magic number
     for (i = 0; i < 256; i++)
         out[i] = (short)round(output[i]/2);
+  }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
-- 
2.50.1