[WebAssembly] Optimize BUILD_VECTOR lowering for size

author Thomas Lively <tlively@google.com>

Wed, 30 Jan 2019 02:23:29 +0000 (02:23 +0000)

committer Thomas Lively <tlively@google.com>

Wed, 30 Jan 2019 02:23:29 +0000 (02:23 +0000)
author Thomas Lively <tlively@google.com>
Wed, 30 Jan 2019 02:23:29 +0000 (02:23 +0000)
committer Thomas Lively <tlively@google.com>
Wed, 30 Jan 2019 02:23:29 +0000 (02:23 +0000)
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

index 1a8b598c074f494492c78d08067cee35961d6df9..7cb7f2750ffb9eeda95a9892c62a65f0b4c7c915 100644 (file)
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -131,6 +131,13 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
        for (auto T : {MVT::v16i8, MVT::v8i16})
          setOperationAction(Op, T, Legal);
  
+    // Custom lower BUILD_VECTORs to minimize number of replace_lanes
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+      setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+    if (Subtarget->hasUnimplementedSIMD128())
+      for (auto T : {MVT::v2i64, MVT::v2f64})
+        setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+
      // We have custom shuffle lowering to expose the shuffle mask
      for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
        setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
@@ -886,6 +893,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
      return LowerINTRINSIC_VOID(Op, DAG);
    case ISD::SIGN_EXTEND_INREG:
      return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
    case ISD::VECTOR_SHUFFLE:
      return LowerVECTOR_SHUFFLE(Op, DAG);
    case ISD::SHL:
@@ -1103,6 +1112,107 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    return SDValue();
  }
  
+/// Custom-lower a BUILD_VECTOR so that it is materialized with as few
+/// replace_lane operations (and, when v128.const is usable, as few encoded
+/// bytes) as possible.
+///
+/// Two strategies are considered:
+///  * splat the most frequently occurring lane value and patch each lane that
+///    differs with an INSERT_VECTOR_ELT, or
+///  * when the subtarget reports hasUnimplementedSIMD128(), start from a
+///    constant build_vector holding every constant lane (zero elsewhere) and
+///    patch only the non-constant lanes.
+/// The constant-vector strategy is chosen only when its estimated encoding is
+/// strictly smaller than the splat strategy's. Undef lanes never receive an
+/// INSERT_VECTOR_ELT in either strategy.
+///
+/// \param Op  The BUILD_VECTOR node; at least one operand must not be undef.
+/// \param DAG The SelectionDAG in which the replacement nodes are created.
+/// \returns The value that replaces \p Op.
+SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const EVT VecT = Op.getValueType();
+  const EVT LaneT = Op.getOperand(0).getValueType();
+  const size_t Lanes = Op.getNumOperands();
+  auto IsConstant = [](const SDValue &V) {
+    return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
+  };
+
+  // Find the most common operand, which is approximately the best to splat.
+  // Undef lanes are ignored: they are neither counted as constant nor dynamic
+  // and can never become the splat value.
+  using Entry = std::pair<SDValue, size_t>;
+  SmallVector<Entry, 16> ValueCounts;
+  size_t NumConst = 0, NumDynamic = 0;
+  for (const SDValue &Lane : Op->op_values()) {
+    if (Lane.isUndef())
+      continue;
+    if (IsConstant(Lane))
+      NumConst++;
+    else
+      NumDynamic++;
+    // Linear search is fine here: there are at most 16 distinct lane values.
+    auto CountIt =
+        std::find_if(ValueCounts.begin(), ValueCounts.end(),
+                     [&Lane](const Entry &A) { return A.first == Lane; });
+    if (CountIt == ValueCounts.end())
+      ValueCounts.emplace_back(Lane, 1);
+    else
+      CountIt->second++;
+  }
+  auto CommonIt = std::max_element(
+      ValueCounts.begin(), ValueCounts.end(),
+      [](const Entry &A, const Entry &B) { return A.second < B.second; });
+  assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector");
+  SDValue SplatValue = CommonIt->first;
+  size_t NumCommon = CommonIt->second;
+
+  // If v128.const is available, consider using it instead of a splat
+  if (Subtarget->hasUnimplementedSIMD128()) {
+    // {i32,i64,f32,f64}.const opcode, and value
+    const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes);
+    // SIMD prefix and opcode
+    const size_t SplatBytes = 2;
+    const size_t SplatConstBytes = SplatBytes + ConstBytes;
+    // SIMD prefix, opcode, and lane index
+    const size_t ReplaceBytes = 3;
+    const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
+    // SIMD prefix, v128.const opcode, and 128-bit value
+    const size_t VecConstBytes = 18;
+    // Initial v128.const and a replace_lane for each non-const operand
+    const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
+    // Initial splat and all necessary replace_lanes
+    const size_t SplatInitBytes =
+        IsConstant(SplatValue)
+            // Initial constant splat
+            ? (SplatConstBytes +
+               // Constant replace_lanes
+               (NumConst - NumCommon) * ReplaceConstBytes +
+               // Dynamic replace_lanes
+               (NumDynamic * ReplaceBytes))
+            // Initial dynamic splat
+            : (SplatBytes +
+               // Constant replace_lanes
+               (NumConst * ReplaceConstBytes) +
+               // Dynamic replace_lanes
+               (NumDynamic - NumCommon) * ReplaceBytes);
+    if (ConstInitBytes < SplatInitBytes) {
+      // Create build_vector that will lower to initial v128.const. Lanes that
+      // are not constant (dynamic or undef) get a zero placeholder.
+      SmallVector<SDValue, 16> ConstLanes;
+      for (const SDValue &Lane : Op->op_values()) {
+        if (IsConstant(Lane)) {
+          ConstLanes.push_back(Lane);
+        } else if (LaneT.isFloatingPoint()) {
+          ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
+        } else {
+          ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
+        }
+      }
+      SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes);
+      // Add replace_lane instructions for non-const lanes
+      for (size_t I = 0; I < Lanes; ++I) {
+        const SDValue &Lane = Op->getOperand(I);
+        if (!Lane.isUndef() && !IsConstant(Lane))
+          Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+                               DAG.getConstant(I, DL, MVT::i32));
+      }
+      return Result;
+    }
+  }
+  // Use a splat for the initial vector
+  SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+  // Add replace_lane instructions for other values. Undef lanes may hold any
+  // value, so they keep the splat value instead of receiving an insert of
+  // undef; this mirrors the undef check in the v128.const path above.
+  for (size_t I = 0; I < Lanes; ++I) {
+    const SDValue &Lane = Op->getOperand(I);
+    if (!Lane.isUndef() && Lane != SplatValue)
+      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+                           DAG.getConstant(I, DL, MVT::i32));
+  }
+  return Result;
+}
+
  SDValue
  WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                 SelectionDAG &DAG) const {
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h

index 33f384b44f192716667ed4f684a9e2a5d35cb568..d4b6dcacafc59b8e0c7cb24aca0e832f94969513 100644 (file)
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -99,6 +99,7 @@ private:
    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

index 2476d8d99cfb8ae6358046f5191889f66908a68f..b7ecd49c7937da0db89ae07561d0592df4d3e7e3 100644 (file)
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -359,118 +359,6 @@ def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef),
  def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
            (REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
  
-// Arbitrary other BUILD_VECTOR patterns
-def : Pat<(v16i8 (build_vector
-            (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
-            (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7),
-            (i32 I32:$x8), (i32 I32:$x9), (i32 I32:$x10), (i32 I32:$x11),
-            (i32 I32:$x12), (i32 I32:$x13), (i32 I32:$x14), (i32 I32:$x15)
-          )),
-          (v16i8 (REPLACE_LANE_v16i8
-            (v16i8 (REPLACE_LANE_v16i8
-              (v16i8 (REPLACE_LANE_v16i8
-                (v16i8 (REPLACE_LANE_v16i8
-                  (v16i8 (REPLACE_LANE_v16i8
-                    (v16i8 (REPLACE_LANE_v16i8
-                      (v16i8 (REPLACE_LANE_v16i8
-                        (v16i8 (REPLACE_LANE_v16i8
-                          (v16i8 (REPLACE_LANE_v16i8
-                            (v16i8 (REPLACE_LANE_v16i8
-                              (v16i8 (REPLACE_LANE_v16i8
-                                (v16i8 (REPLACE_LANE_v16i8
-                                  (v16i8 (REPLACE_LANE_v16i8
-                                    (v16i8 (REPLACE_LANE_v16i8
-                                      (v16i8 (REPLACE_LANE_v16i8
-                                        (v16i8 (SPLAT_v16i8 (i32 I32:$x0))),
-                                        1, I32:$x1
-                                      )),
-                                      2, I32:$x2
-                                    )),
-                                    3, I32:$x3
-                                  )),
-                                  4, I32:$x4
-                                )),
-                                5, I32:$x5
-                              )),
-                              6, I32:$x6
-                            )),
-                            7, I32:$x7
-                          )),
-                          8, I32:$x8
-                        )),
-                        9, I32:$x9
-                      )),
-                      10, I32:$x10
-                    )),
-                    11, I32:$x11
-                  )),
-                  12, I32:$x12
-                )),
-                13, I32:$x13
-              )),
-              14, I32:$x14
-            )),
-            15, I32:$x15
-          ))>;
-def : Pat<(v8i16 (build_vector
-            (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
-            (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7)
-          )),
-          (v8i16 (REPLACE_LANE_v8i16
-            (v8i16 (REPLACE_LANE_v8i16
-              (v8i16 (REPLACE_LANE_v8i16
-                (v8i16 (REPLACE_LANE_v8i16
-                  (v8i16 (REPLACE_LANE_v8i16
-                    (v8i16 (REPLACE_LANE_v8i16
-                      (v8i16 (REPLACE_LANE_v8i16
-                        (v8i16 (SPLAT_v8i16 (i32 I32:$x0))),
-                        1, I32:$x1
-                      )),
-                      2, I32:$x2
-                    )),
-                    3, I32:$x3
-                  )),
-                  4, I32:$x4
-                )),
-                5, I32:$x5
-              )),
-              6, I32:$x6
-            )),
-            7, I32:$x7
-          ))>;
-def : Pat<(v4i32 (build_vector
-            (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3)
-          )),
-          (v4i32 (REPLACE_LANE_v4i32
-            (v4i32 (REPLACE_LANE_v4i32
-              (v4i32 (REPLACE_LANE_v4i32
-                (v4i32 (SPLAT_v4i32 (i32 I32:$x0))),
-                1, I32:$x1
-              )),
-              2, I32:$x2
-            )),
-            3, I32:$x3
-          ))>;
-def : Pat<(v2i64 (build_vector (i64 I64:$x0), (i64 I64:$x1))),
-          (v2i64 (REPLACE_LANE_v2i64
-            (v2i64 (SPLAT_v2i64 (i64 I64:$x0))), 1, I64:$x1))>;
-def : Pat<(v4f32 (build_vector
-            (f32 F32:$x0), (f32 F32:$x1), (f32 F32:$x2), (f32 F32:$x3)
-          )),
-          (v4f32 (REPLACE_LANE_v4f32
-            (v4f32 (REPLACE_LANE_v4f32
-              (v4f32 (REPLACE_LANE_v4f32
-                (v4f32 (SPLAT_v4f32 (f32 F32:$x0))),
-                1, F32:$x1
-              )),
-              2, F32:$x2
-            )),
-            3, F32:$x3
-          ))>;
-def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
-          (v2f64 (REPLACE_LANE_v2f64
-            (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
-
  //===----------------------------------------------------------------------===//
  // Comparisons
  //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/WebAssembly/simd-build-vector.ll b/test/CodeGen/WebAssembly/simd-build-vector.ll

new file mode 100644 (file)

index 0000000..ab08ef4
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -0,0 +1,127 @@
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s
+
+; Test that the logic to choose between v128.const vector
+; initialization and splat vector initialization and to optimize the
+; choice of splat value works correctly.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; One dynamic lane inserted into a vector whose other lanes are all the same
+; constant (42): expect a splat of the constant and a single replace_lane.
+; NOTE: despite the "i8x16" suffix in the name, this test operates on
+; <8 x i16> and checks i16x8 instructions.
+; CHECK-LABEL: same_const_one_replaced_i8x16:
+; CHECK-NEXT:  .functype       same_const_one_replaced_i8x16 (i32) -> (v128)
+; CHECK-NEXT:  i32.const       $push[[L0:[0-9]+]]=, 42
+; CHECK-NEXT:  i16x8.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT:  i16x8.replace_lane      $push[[L2:[0-9]+]]=, $pop[[L1]], 5, $0
+; CHECK-NEXT:  return          $pop[[L2]]
+define <8 x i16> @same_const_one_replaced_i8x16(i16 %x) {
+  %v = insertelement
+    <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>,
+    i16 %x,
+    i32 5
+  ret <8 x i16> %v
+}
+
+; One dynamic lane inserted into a vector of pairwise-different constants:
+; expect a v128.const (with 0 in the overwritten lane 5) followed by a single
+; replace_lane, rather than a splat plus many replace_lanes.
+; NOTE: despite the "i8x16" suffix in the name, this test operates on
+; <8 x i16> and checks i16x8 instructions.
+; CHECK-LABEL: different_const_one_replaced_i8x16:
+; CHECK-NEXT:  .functype       different_const_one_replaced_i8x16 (i32) -> (v128)
+; CHECK-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 1, 2, 3, 4, 5, 0, 7, 8
+; CHECK-NEXT:  i16x8.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
+; CHECK-NEXT:  return          $pop[[L1]]
+define <8 x i16> @different_const_one_replaced_i8x16(i16 %x) {
+  %v = insertelement
+    <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>,
+    i16 %x,
+    i32 5
+  ret <8 x i16> %v
+}
+
+; Floating-point variant: one dynamic lane inserted into a vector whose other
+; lanes are all 42.0. Expect a splat of the constant and a single replace_lane.
+; CHECK-LABEL: same_const_one_replaced_f32x4:
+; CHECK-NEXT:  .functype       same_const_one_replaced_f32x4 (f32) -> (v128)
+; CHECK-NEXT:  f32.const       $push[[L0:[0-9]+]]=, 0x1.5p5
+; CHECK-NEXT:  f32x4.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT:  f32x4.replace_lane      $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
+; CHECK-NEXT:  return          $pop[[L2]]
+define <4 x float> @same_const_one_replaced_f32x4(float %x) {
+  %v = insertelement
+    <4 x float> <float 42., float 42., float 42., float 42.>,
+    float %x,
+    i32 2
+  ret <4 x float> %v
+}
+
+; Floating-point variant: one dynamic lane inserted into a vector of
+; pairwise-different constants. Expect a v128.const (with 0.0 in the
+; overwritten lane 2) followed by a single replace_lane.
+; CHECK-LABEL: different_const_one_replaced_f32x4:
+; CHECK-NEXT:  .functype       different_const_one_replaced_f32x4 (f32) -> (v128)
+; CHECK-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 0x1p0, 0x1p1, 0x0p0, 0x1p2
+; CHECK-NEXT:  f32x4.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0
+; CHECK-NEXT:  return          $pop[[L1]]
+define <4 x float> @different_const_one_replaced_f32x4(float %x) {
+  %v = insertelement
+    <4 x float> <float 1., float 2., float 3., float 4.>,
+    float %x,
+    i32 2
+  ret <4 x float> %v
+}
+
+; All-constant vector with one undef lane: the most common constant (3) is
+; expected to be splatted, with a single replace_lane for the differing
+; constant in lane 3 and no replace_lane for the undef lane 0.
+; CHECK-LABEL: splat_common_const_i32x4:
+; CHECK-NEXT:  .functype       splat_common_const_i32x4 () -> (v128)
+; CHECK-NEXT:  i32.const       $push[[L0:[0-9]+]]=, 3
+; CHECK-NEXT:  i32x4.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT:  i32.const       $push[[L2:[0-9]+]]=, 1
+; CHECK-NEXT:  i32x4.replace_lane      $push[[L3:[0-9]+]]=, $pop[[L1]], 3, $pop[[L2]]
+; CHECK-NEXT:  return          $pop[[L3]]
+define <4 x i32> @splat_common_const_i32x4() {
+  ret <4 x i32> <i32 undef, i32 3, i32 3, i32 1>
+}
+
+; All-dynamic vector built from three arguments with lane pattern
+; b, c, a, c, b, c, c, b. The most common value (%c, four lanes) is expected
+; to be splatted, with replace_lanes only for the four lanes holding %a or %b.
+; CHECK-LABEL: splat_common_arg_i16x8:
+; CHECK-NEXT:  .functype       splat_common_arg_i16x8 (i32, i32, i32) -> (v128)
+; CHECK-NEXT:  i16x8.splat     $push[[L0:[0-9]+]]=, $2
+; CHECK-NEXT:  i16x8.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 0, $1
+; CHECK-NEXT:  i16x8.replace_lane      $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
+; CHECK-NEXT:  i16x8.replace_lane      $push[[L3:[0-9]+]]=, $pop[[L2]], 4, $1
+; CHECK-NEXT:  i16x8.replace_lane      $push[[L4:[0-9]+]]=, $pop[[L3]], 7, $1
+; CHECK-NEXT:  return          $pop[[L4]]
+define <8 x i16> @splat_common_arg_i16x8(i16 %a, i16 %b, i16 %c) {
+  %v0 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %c, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %a, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %c, i32 3
+  %v4 = insertelement <8 x i16> %v3, i16 %b, i32 4
+  %v5 = insertelement <8 x i16> %v4, i16 %c, i32 5
+  %v6 = insertelement <8 x i16> %v5, i16 %c, i32 6
+  %v7 = insertelement <8 x i16> %v6, i16 %b, i32 7
+  ret <8 x i16> %v7
+}
+
+; A single constant inserted into an otherwise undef vector: expect only a
+; splat of that constant, with no replace_lane for the undef lanes.
+; CHECK-LABEL: undef_const_insert_f32x4:
+; CHECK-NEXT:  .functype       undef_const_insert_f32x4 () -> (v128)
+; CHECK-NEXT:  f32.const       $push[[L0:[0-9]+]]=, 0x1.5p5
+; CHECK-NEXT:  f32x4.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT:  return          $pop[[L1]]
+define <4 x float> @undef_const_insert_f32x4() {
+  %v = insertelement <4 x float> undef, float 42., i32 1
+  ret <4 x float> %v
+}
+
+; A single dynamic value inserted into an otherwise undef vector: expect only
+; a splat of the argument, with no replace_lane for the undef lanes.
+; CHECK-LABEL: undef_arg_insert_i32x4:
+; CHECK-NEXT:  .functype       undef_arg_insert_i32x4 (i32) -> (v128)
+; CHECK-NEXT:  i32x4.splat     $push[[L0:[0-9]+]]=, $0
+; CHECK-NEXT:  return          $pop[[L0]]
+define <4 x i32> @undef_arg_insert_i32x4(i32 %x) {
+  %v = insertelement <4 x i32> undef, i32 %x, i32 3
+  ret <4 x i32> %v
+}
+
+; Undef inserted into an undef vector: expect no vector-construction
+; instructions at all, only the return.
+; CHECK-LABEL: all_undef_i8x16:
+; CHECK-NEXT:  .functype       all_undef_i8x16 () -> (v128)
+; CHECK-NEXT:  return          $0
+define <16 x i8> @all_undef_i8x16() {
+  %v = insertelement <16 x i8> undef, i8 undef, i32 4
+  ret <16 x i8> %v
+}
+
+; Returning a fully undef vector: expect no vector-construction instructions
+; at all, only the return.
+; CHECK-LABEL: all_undef_f64x2:
+; CHECK-NEXT:  .functype       all_undef_f64x2 () -> (v128)
+; CHECK-NEXT:  return          $0
+define <2 x double> @all_undef_f64x2() {
+  ret <2 x double> undef
+}
author	Thomas Lively <tlively@google.com>
	Wed, 30 Jan 2019 02:23:29 +0000 (02:23 +0000)
committer	Thomas Lively <tlively@google.com>
	Wed, 30 Jan 2019 02:23:29 +0000 (02:23 +0000)
lib/Target/WebAssembly/WebAssemblyISelLowering.cpp		patch \| blob \| history
lib/Target/WebAssembly/WebAssemblyISelLowering.h		patch \| blob \| history
lib/Target/WebAssembly/WebAssemblyInstrSIMD.td		patch \| blob \| history
test/CodeGen/WebAssembly/simd-build-vector.ll	[new file with mode: 0644]	patch \| blob