vp9 quantize: change index
author Johann <johann.koenig@gmail.com>
Sat, 1 Oct 2022 02:18:09 +0000 (11:18 +0900)
committer Johann <johann.koenig@gmail.com>
Sat, 1 Oct 2022 02:50:46 +0000 (11:50 +0900)
In assembly it made sense to iterate using a negated n_coeffs that
counts up toward zero. With intrinsics it is just as fast to use a
positive index that counts up to n_coeffs, and it is easier to read.

Change-Id: I403c959709309dad68123d0a3d0efe183874543d

vp9/encoder/x86/vp9_quantize_sse2.c

index da4cd9ee8f79e4a0e64dafb5f9a3b9495345b8dd..272e5fb07998dbb4426e4bfa5b323eae54eb5424 100644 (file)
@@ -26,72 +26,58 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   const __m128i zero = _mm_setzero_si128();
   __m128i thr;
   int nzflag;
-  __m128i eob;
+  int index = 16;
   __m128i round, quant, dequant;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
 
   (void)scan;
 
-  coeff_ptr += n_coeffs;
-  iscan += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-
   // Setup global values.
   load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
 
-  {
-    __m128i coeff0, coeff1;
-    __m128i coeff0_sign, coeff1_sign;
-    __m128i qcoeff0, qcoeff1;
-    // Do DC and first 15 AC.
-    coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-    coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
 
-    // Poor man's abs().
-    coeff0_sign = _mm_srai_epi16(coeff0, 15);
-    coeff1_sign = _mm_srai_epi16(coeff1, 15);
-    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
-    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-    qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-    qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
 
-    round = _mm_unpackhi_epi64(round, round);
-    quant = _mm_unpackhi_epi64(quant, quant);
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
 
-    qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-    qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-    // Reinsert signs.
-    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
-    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+  // Reinsert signs.
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
-    store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-    store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
 
-    qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-    dequant = _mm_unpackhi_epi64(dequant, dequant);
-    qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-    store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs);
-    store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8);
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero);
-
-    n_coeffs += 8 * 2;
-  }
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
 
   thr = _mm_srai_epi16(dequant, 1);
 
   // AC only loop.
-  while (n_coeffs < 0) {
-    __m128i coeff0, coeff1;
-    __m128i coeff0_sign, coeff1_sign;
-    __m128i qcoeff0, qcoeff1;
-
-    coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-    coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
 
     // Poor man's abs().
     coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -112,28 +98,27 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
       qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
-      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
 
       coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
       coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
+      store_tran_low(coeff0, dqcoeff_ptr + index);
+      store_tran_low(coeff1, dqcoeff_ptr + index + 8);
     } else {
-      store_zero_tran_low(qcoeff_ptr + n_coeffs);
-      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
 
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
     }
 
     if (nzflag) {
-      const __m128i eob0 =
-          scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero);
+      const __m128i eob0 = scan_for_eob(&coeff0, &coeff1, iscan, index, zero);
       eob = _mm_max_epi16(eob, eob0);
     }
-    n_coeffs += 8 * 2;
+    index += 16;
   }
 
   *eob_ptr = accumulate_eob(eob);