for (auto T : {MVT::v16i8, MVT::v8i16})
setOperationAction(Op, T, Legal);
+ // Custom lower BUILD_VECTORs to minimize number of replace_lanes
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ if (Subtarget->hasUnimplementedSIMD128())
+ for (auto T : {MVT::v2i64, MVT::v2f64})
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+
// We have custom shuffle lowering to expose the shuffle mask
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
return LowerINTRINSIC_VOID(Op, DAG);
case ISD::SIGN_EXTEND_INREG:
return LowerSIGN_EXTEND_INREG(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::SHL:
return SDValue();
}
+/// Custom-lower a BUILD_VECTOR to a short initialization sequence.
+///
+/// Two strategies are considered:
+///  * splat the most frequently occurring lane value and patch every other
+///    defined lane with an INSERT_VECTOR_ELT (replace_lane), or
+///  * when the subtarget has unimplemented-simd128 (so v128.const is
+///    available), start from a constant vector and patch only the
+///    non-constant lanes.
+/// When both are available, the one with the smaller estimated encoded size
+/// (per the byte counts below) is chosen. Undef lanes never get an insert of
+/// their own: they keep the splat value in the splat path and are
+/// materialized as zero in the constant-vector path.
+///
+/// \param Op  The BUILD_VECTOR node to lower. It must have at least one
+///            non-undef operand (asserted).
+/// \param DAG The SelectionDAG used to create the replacement nodes.
+/// \returns The vector value that replaces \p Op.
+SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const EVT VecT = Op.getValueType();
+  const EVT LaneT = Op.getOperand(0).getValueType();
+  const size_t Lanes = Op.getNumOperands();
+  auto IsConstant = [](const SDValue &V) {
+    return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
+  };
+
+  // Find the most common operand, which is approximately the best to splat.
+  // Undef lanes are counted neither as constant nor as dynamic.
+  using Entry = std::pair<SDValue, size_t>;
+  SmallVector<Entry, 16> ValueCounts;
+  size_t NumConst = 0, NumDynamic = 0;
+  for (const SDValue &Lane : Op->op_values()) {
+    if (Lane.isUndef())
+      continue;
+    if (IsConstant(Lane))
+      ++NumConst;
+    else
+      ++NumDynamic;
+    auto CountIt =
+        std::find_if(ValueCounts.begin(), ValueCounts.end(),
+                     [&Lane](const Entry &A) { return A.first == Lane; });
+    if (CountIt == ValueCounts.end())
+      ValueCounts.emplace_back(Lane, 1);
+    else
+      ++CountIt->second;
+  }
+  // std::max_element returns the first maximal entry, so ties are resolved in
+  // favor of the value that appears earliest in the operand list.
+  auto CommonIt = std::max_element(
+      ValueCounts.begin(), ValueCounts.end(),
+      [](const Entry &A, const Entry &B) { return A.second < B.second; });
+  assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector");
+  SDValue SplatValue = CommonIt->first;
+  size_t NumCommon = CommonIt->second;
+
+  // If v128.const is available, consider using it instead of a splat
+  if (Subtarget->hasUnimplementedSIMD128()) {
+    // {i32,i64,f32,f64}.const opcode, and value
+    const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes);
+    // SIMD prefix and opcode
+    const size_t SplatBytes = 2;
+    const size_t SplatConstBytes = SplatBytes + ConstBytes;
+    // SIMD prefix, opcode, and lane index
+    const size_t ReplaceBytes = 3;
+    const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
+    // SIMD prefix, v128.const opcode, and 128-bit value
+    const size_t VecConstBytes = 18;
+    // Initial v128.const and a replace_lane for each non-const operand
+    const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
+    // Initial splat and all necessary replace_lanes
+    const size_t SplatInitBytes =
+        IsConstant(SplatValue)
+            // Initial constant splat
+            ? (SplatConstBytes +
+               // Constant replace_lanes
+               (NumConst - NumCommon) * ReplaceConstBytes +
+               // Dynamic replace_lanes
+               (NumDynamic * ReplaceBytes))
+            // Initial dynamic splat
+            : (SplatBytes +
+               // Constant replace_lanes
+               (NumConst * ReplaceConstBytes) +
+               // Dynamic replace_lanes
+               (NumDynamic - NumCommon) * ReplaceBytes);
+    if (ConstInitBytes < SplatInitBytes) {
+      // Create build_vector that will lower to initial v128.const. Lanes that
+      // are not constants (dynamic or undef) are filled with zero.
+      SmallVector<SDValue, 16> ConstLanes;
+      for (const SDValue &Lane : Op->op_values()) {
+        if (IsConstant(Lane)) {
+          ConstLanes.push_back(Lane);
+        } else if (LaneT.isFloatingPoint()) {
+          ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
+        } else {
+          ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
+        }
+      }
+      SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes);
+      // Add replace_lane instructions for non-const lanes
+      for (size_t I = 0; I < Lanes; ++I) {
+        const SDValue &Lane = Op->getOperand(I);
+        if (!Lane.isUndef() && !IsConstant(Lane))
+          Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+                               DAG.getConstant(I, DL, MVT::i32));
+      }
+      return Result;
+    }
+  }
+  // Use a splat for the initial vector
+  SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+  // Add replace_lane instructions for other values. Undef lanes are skipped,
+  // matching the v128.const path above: any value is acceptable for them, so
+  // they simply keep the splat value instead of getting an insert of undef.
+  for (size_t I = 0; I < Lanes; ++I) {
+    const SDValue &Lane = Op->getOperand(I);
+    if (!Lane.isUndef() && Lane != SplatValue)
+      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+                           DAG.getConstant(I, DL, MVT::i32));
+  }
+  return Result;
+}
+
SDValue
WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
// A v2f64 vector_insert whose lane index is undef is selected as a
// replace_lane of lane 0.
def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
(REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
-// Arbitrary other BUILD_VECTOR patterns
-def : Pat<(v16i8 (build_vector
- (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
- (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7),
- (i32 I32:$x8), (i32 I32:$x9), (i32 I32:$x10), (i32 I32:$x11),
- (i32 I32:$x12), (i32 I32:$x13), (i32 I32:$x14), (i32 I32:$x15)
- )),
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (SPLAT_v16i8 (i32 I32:$x0))),
- 1, I32:$x1
- )),
- 2, I32:$x2
- )),
- 3, I32:$x3
- )),
- 4, I32:$x4
- )),
- 5, I32:$x5
- )),
- 6, I32:$x6
- )),
- 7, I32:$x7
- )),
- 8, I32:$x8
- )),
- 9, I32:$x9
- )),
- 10, I32:$x10
- )),
- 11, I32:$x11
- )),
- 12, I32:$x12
- )),
- 13, I32:$x13
- )),
- 14, I32:$x14
- )),
- 15, I32:$x15
- ))>;
-def : Pat<(v8i16 (build_vector
- (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
- (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7)
- )),
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (SPLAT_v8i16 (i32 I32:$x0))),
- 1, I32:$x1
- )),
- 2, I32:$x2
- )),
- 3, I32:$x3
- )),
- 4, I32:$x4
- )),
- 5, I32:$x5
- )),
- 6, I32:$x6
- )),
- 7, I32:$x7
- ))>;
-def : Pat<(v4i32 (build_vector
- (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3)
- )),
- (v4i32 (REPLACE_LANE_v4i32
- (v4i32 (REPLACE_LANE_v4i32
- (v4i32 (REPLACE_LANE_v4i32
- (v4i32 (SPLAT_v4i32 (i32 I32:$x0))),
- 1, I32:$x1
- )),
- 2, I32:$x2
- )),
- 3, I32:$x3
- ))>;
-def : Pat<(v2i64 (build_vector (i64 I64:$x0), (i64 I64:$x1))),
- (v2i64 (REPLACE_LANE_v2i64
- (v2i64 (SPLAT_v2i64 (i64 I64:$x0))), 1, I64:$x1))>;
-def : Pat<(v4f32 (build_vector
- (f32 F32:$x0), (f32 F32:$x1), (f32 F32:$x2), (f32 F32:$x3)
- )),
- (v4f32 (REPLACE_LANE_v4f32
- (v4f32 (REPLACE_LANE_v4f32
- (v4f32 (REPLACE_LANE_v4f32
- (v4f32 (SPLAT_v4f32 (f32 F32:$x0))),
- 1, F32:$x1
- )),
- 2, F32:$x2
- )),
- 3, F32:$x3
- ))>;
-def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
- (v2f64 (REPLACE_LANE_v2f64
- (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
-
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
--- /dev/null
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s
+
+; Test that the logic to choose between v128.const vector
+; initialization and splat vector initialization and to optimize the
+; choice of splat value works correctly.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; Seven lanes hold the same constant and one lane is a function argument. The
+; expected output splats the constant and patches the single dynamic lane with
+; one replace_lane instead of using v128.const.
+; NOTE: despite the "i8x16" suffix in its name, this test operates on
+; <8 x i16> vectors (i16x8 instructions).
+; CHECK-LABEL: same_const_one_replaced_i8x16:
+; CHECK-NEXT: .functype same_const_one_replaced_i8x16 (i32) -> (v128)
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 42
+; CHECK-NEXT: i16x8.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT: i16x8.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 5, $0
+; CHECK-NEXT: return $pop[[L2]]
+define <8 x i16> @same_const_one_replaced_i8x16(i16 %x) {
+ %v = insertelement
+ <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>,
+ i16 %x,
+ i32 5
+ ret <8 x i16> %v
+}
+
+; All constant lanes are distinct and one lane is a function argument. The
+; expected output starts from a v128.const, in which the dynamic lane (5) is
+; emitted as 0, and then patches that lane with one replace_lane.
+; NOTE: despite the "i8x16" suffix in its name, this test operates on
+; <8 x i16> vectors (i16x8 instructions).
+; CHECK-LABEL: different_const_one_replaced_i8x16:
+; CHECK-NEXT: .functype different_const_one_replaced_i8x16 (i32) -> (v128)
+; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 1, 2, 3, 4, 5, 0, 7, 8
+; CHECK-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
+; CHECK-NEXT: return $pop[[L1]]
+define <8 x i16> @different_const_one_replaced_i8x16(i16 %x) {
+ %v = insertelement
+ <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>,
+ i16 %x,
+ i32 5
+ ret <8 x i16> %v
+}
+
+; Floating-point variant of the repeated-constant case: three lanes hold the
+; same constant and one lane is a function argument. The expected output
+; splats the constant and patches lane 2 with one replace_lane.
+; CHECK-LABEL: same_const_one_replaced_f32x4:
+; CHECK-NEXT: .functype same_const_one_replaced_f32x4 (f32) -> (v128)
+; CHECK-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.5p5
+; CHECK-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT: f32x4.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
+; CHECK-NEXT: return $pop[[L2]]
+define <4 x float> @same_const_one_replaced_f32x4(float %x) {
+ %v = insertelement
+ <4 x float> <float 42., float 42., float 42., float 42.>,
+ float %x,
+ i32 2
+ ret <4 x float> %v
+}
+
+; Floating-point variant of the distinct-constants case: the expected output
+; starts from a v128.const, in which the dynamic lane (2) is emitted as zero,
+; and then patches that lane with one replace_lane.
+; CHECK-LABEL: different_const_one_replaced_f32x4:
+; CHECK-NEXT: .functype different_const_one_replaced_f32x4 (f32) -> (v128)
+; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1p0, 0x1p1, 0x0p0, 0x1p2
+; CHECK-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0
+; CHECK-NEXT: return $pop[[L1]]
+define <4 x float> @different_const_one_replaced_f32x4(float %x) {
+ %v = insertelement
+ <4 x float> <float 1., float 2., float 3., float 4.>,
+ float %x,
+ i32 2
+ ret <4 x float> %v
+}
+
+; An all-constant vector with one undef lane. The expected output splats the
+; most common constant (3) and patches only lane 3 with the other constant;
+; no instruction is expected for the undef lane 0.
+; CHECK-LABEL: splat_common_const_i32x4:
+; CHECK-NEXT: .functype splat_common_const_i32x4 () -> (v128)
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 3
+; CHECK-NEXT: i32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 1
+; CHECK-NEXT: i32x4.replace_lane $push[[L3:[0-9]+]]=, $pop[[L1]], 3, $pop[[L2]]
+; CHECK-NEXT: return $pop[[L3]]
+define <4 x i32> @splat_common_const_i32x4() {
+ ret <4 x i32> <i32 undef, i32 3, i32 3, i32 1>
+}
+
+; All lanes are function arguments: %c fills four lanes, %b three and %a one.
+; The expected output splats the most common value (%c, register $2) and fills
+; the four remaining lanes (0, 2, 4 and 7) with replace_lane.
+; CHECK-LABEL: splat_common_arg_i16x8:
+; CHECK-NEXT: .functype splat_common_arg_i16x8 (i32, i32, i32) -> (v128)
+; CHECK-NEXT: i16x8.splat $push[[L0:[0-9]+]]=, $2
+; CHECK-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 0, $1
+; CHECK-NEXT: i16x8.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
+; CHECK-NEXT: i16x8.replace_lane $push[[L3:[0-9]+]]=, $pop[[L2]], 4, $1
+; CHECK-NEXT: i16x8.replace_lane $push[[L4:[0-9]+]]=, $pop[[L3]], 7, $1
+; CHECK-NEXT: return $pop[[L4]]
+define <8 x i16> @splat_common_arg_i16x8(i16 %a, i16 %b, i16 %c) {
+ %v0 = insertelement <8 x i16> undef, i16 %b, i32 0
+ %v1 = insertelement <8 x i16> %v0, i16 %c, i32 1
+ %v2 = insertelement <8 x i16> %v1, i16 %a, i32 2
+ %v3 = insertelement <8 x i16> %v2, i16 %c, i32 3
+ %v4 = insertelement <8 x i16> %v3, i16 %b, i32 4
+ %v5 = insertelement <8 x i16> %v4, i16 %c, i32 5
+ %v6 = insertelement <8 x i16> %v5, i16 %c, i32 6
+ %v7 = insertelement <8 x i16> %v6, i16 %b, i32 7
+ ret <8 x i16> %v7
+}
+
+; A single constant inserted into an otherwise undef vector. The expected
+; output is just a splat of that constant, with no replace_lane.
+; CHECK-LABEL: undef_const_insert_f32x4:
+; CHECK-NEXT: .functype undef_const_insert_f32x4 () -> (v128)
+; CHECK-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.5p5
+; CHECK-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; CHECK-NEXT: return $pop[[L1]]
+define <4 x float> @undef_const_insert_f32x4() {
+ %v = insertelement <4 x float> undef, float 42., i32 1
+ ret <4 x float> %v
+}
+
+; A single function argument inserted into an otherwise undef vector. The
+; expected output is just a splat of the argument, with no replace_lane.
+; CHECK-LABEL: undef_arg_insert_i32x4:
+; CHECK-NEXT: .functype undef_arg_insert_i32x4 (i32) -> (v128)
+; CHECK-NEXT: i32x4.splat $push[[L0:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[L0]]
+define <4 x i32> @undef_arg_insert_i32x4(i32 %x) {
+ %v = insertelement <4 x i32> undef, i32 %x, i32 3
+ ret <4 x i32> %v
+}
+
+; Inserting undef into an undef vector leaves every lane undef. The expected
+; output contains no vector initialization at all, only the return.
+; CHECK-LABEL: all_undef_i8x16:
+; CHECK-NEXT: .functype all_undef_i8x16 () -> (v128)
+; CHECK-NEXT: return $0
+define <16 x i8> @all_undef_i8x16() {
+ %v = insertelement <16 x i8> undef, i8 undef, i32 4
+ ret <16 x i8> %v
+}
+
+; Returning an undef vector directly. The expected output contains no vector
+; initialization at all, only the return.
+; CHECK-LABEL: all_undef_f64x2:
+; CHECK-NEXT: .functype all_undef_f64x2 () -> (v128)
+; CHECK-NEXT: return $0
+define <2 x double> @all_undef_f64x2() {
+ ret <2 x double> undef
+}