#define JSIMD_FASTLD3 1
#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
static unsigned int simd_support = ~0;
static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+ JSIMD_FASTTBL;
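+/* Assume every fast path is usable until the /proc/cpuinfo checks below
+   identify a core that needs one of these bits cleared. */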
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
free(buffer);
return 0;
}
- if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+ if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+ check_cpuinfo(buffer, "CPU part", "0xd07"))
+ /* The Cortex-A53 has a slow tbl implementation. We can gain a few
+ percent speedup by disabling the use of that instruction. The
+ speedup on Cortex-A57 is more subtle but still measurable. */
+ simd_features &= ~JSIMD_FASTTBL;
+ else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
/* The SIMD version of Huffman encoding is slower than the C version on
   Cavium ThunderX. Also, ld3 and st3 are abysmally slow on that
CPU. */
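
check_cpuinfo() itself is outside this hunk. Judging from its
(buffer, field, value) call sites above, it behaves roughly like the
sketch below; the body is an assumption for illustration, not the
library's actual implementation:

#include <string.h>

/* Hypothetical stand-in for check_cpuinfo(): returns nonzero if some
   "field : value" line in the /proc/cpuinfo text starts with field and
   carries value.  The buffer is left unmodified so that it can be
   scanned repeatedly, as the call sites above do. */
static int check_cpuinfo_sketch(const char *buffer, const char *field,
                                const char *value)
{
  const char *line = buffer;

  while (*line) {
    if (!strncmp(line, field, strlen(field))) {
      const char *p = strchr(line, ':');
      const char *end = strchr(line, '\n');

      if (p && (!end || p < end)) {
        p++;
        while (*p == ' ' || *p == '\t')
          p++;                          /* skip the ": " separator */
        if (!strncmp(p, value, strlen(value)))
          return 1;
      }
    }
    line = strchr(line, '\n');          /* advance to the next line */
    if (!line)
      break;
    line++;
  }
  return 0;
}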
int last_dc_val, c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
- return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
- dctbl, actbl);
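+  /* Run-time dispatch: the CPU checks above may have cleared
+     JSIMD_FASTTBL, in which case the slow-tbl kernel is used. */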
+ if (simd_features & JSIMD_FASTTBL)
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+ else
+ return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+ last_dc_val, dctbl, actbl);
}
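
The wrapper keeps the exported symbol stable and hides the kernel choice
behind a feature bit. A standalone illustration of that pattern (all
names here are invented for the example, not part of the patch):

#include <stdio.h>

#define FASTLD3 1
#define FASTST3 2
#define FASTTBL 4

static unsigned int features = FASTLD3 | FASTST3 | FASTTBL;

static const char *huff_kernel(void)
{
  /* Clearing a bit reroutes every caller to the fallback kernel
     without changing the external entry point. */
  return (features & FASTTBL) ? "neon" : "neon_slowtbl";
}

int main(void)
{
  printf("%s\n", huff_kernel());  /* prints "neon" */
  features &= ~FASTTBL;           /* e.g. cpuinfo reported a Cortex-A53 */
  printf("%s\n", huff_kernel());  /* prints "neon_slowtbl" */
  return 0;
}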
/*****************************************************************************/
-#define TBL_IS_FAST
-
/*
* GLOBAL(JOCTET*)
* jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer,
47:
.endm
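+/* The macro below is expanded twice at the bottom of the file:
+   fast_tbl=1 emits jsimd_huff_encode_one_block_neon and fast_tbl=0
+   emits jsimd_huff_encode_one_block_neon_slowtbl. */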
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
.balign 16
+.if \fast_tbl == 1
Ljsimd_huff_encode_one_block_neon_consts:
+.else
+Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
+.endif
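+/* Powers-of-two byte mask, used to pack per-lane comparison results
+   into a coefficient bitmap (NEON's stand-in for a movemask). */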
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-#if defined(TBL_IS_FAST)
+.if \fast_tbl == 1
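+/* tbl/tbx index tables that gather the coefficients into zig-zag
+   order.  255 is out of range for tbl, which zeroes those lanes,
+   while tbx leaves them untouched for a later lookup to fill in. */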
.byte 0, 1, 2, 3, 16, 17, 32, 33, \
18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
.byte 34, 35, 48, 49, 255, 255, 50, 51, \
255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
-#endif
+.endif
+.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
/* Save ARM registers */
stp x19, x20, [sp], 16
+.if \fast_tbl == 1
adr x15, Ljsimd_huff_encode_one_block_neon_consts
+.else
+ adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
+.endif
ldr PUT_BUFFER, [x0, #0x10]
ldr PUT_BITSw, [x0, #0x18]
ldrsh w12, [x2] /* load DC coeff in w12 */
/* prepare data */
-#if defined(TBL_IS_FAST)
+.if \fast_tbl == 1
ld1 {v23.16b}, [x15], #16
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
tbx v2.16b, {v29.16b, v30.16b}, v17.16b
tbx v5.16b, {v29.16b, v30.16b}, v18.16b
tbx v6.16b, {v31.16b}, v19.16b
-#else
+.else
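+    /* fast_tbl == 0: gather the zig-zag order with per-lane ld1 loads
+       from computed offsets instead of tbl/tbx permutes. */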
add x13, x2, #0x22
sub w12, w12, w3 /* last_dc_val, not used afterwards */
ld1 {v23.16b}, [x15]
ld1 {v5.h}[7], [x15]
ld1 {v6.h}[7], [x19]
ld1 {v7.h}[7], [x20]
-#endif
+.endif
cmlt v24.8h, v0.8h, #0
cmlt v25.8h, v1.8h, #0
cmlt v26.8h, v2.8h, #0
add sp, sp, 256
br x30
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
.unreq BUFFER
.unreq PUT_BUFFER
.unreq PUT_BITS