From b7b4ba37dea05c3d0b5fc3bc7619d436ee01b18e Mon Sep 17 00:00:00 2001
From: Wei Mi
Date: Tue, 14 Jun 2016 18:53:20 +0000
Subject: [PATCH] [X86] Reduce the width of multiplication when its operands
 are extended from i8 or i16

For <4 x i32> type mul, pmuludq will be used for targets without SSE41,
which often introduces many extra pack and unpack instructions in the
vectorized loop body because pmuludq generates a <2 x i64> type value.
However, when the operands of the mul are extended from smaller types
like i8 and i16, the type of the mul may be shrunk to use
pmullw + pmulhw/pmulhuw instead of pmuludq, which generates better code.
For targets with SSE41, pmulld is supported, so no shrinking is needed.

Differential Revision: http://reviews.llvm.org/D20931

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272694 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorTypes.cpp |   2 +
 lib/Target/X86/X86ISelLowering.cpp       | 211 ++++-
 test/CodeGen/X86/shrink_vmul.ll          | 864 ++++++++++++++++++
 3 files changed, 1074 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/X86/shrink_vmul.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 9ba68bf2be3..572fecac219 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -670,6 +670,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 69cf0d269a5..06a0aa39603 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26962,10 +26962,216 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Different mul shrinking modes.
+enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+  EVT VT = N->getOperand(0).getValueType();
+  if (VT.getScalarSizeInBits() != 32)
+    return false;
+
+  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+  unsigned SignBits[2] = {1, 1};
+  bool IsPositive[2] = {false, false};
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue Opd = N->getOperand(i);
+
+    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
+    // compute signbits for it separately.
+    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
+      // For anyextend, it is safe to assume an appropriate number of leading
+      // sign/zero bits.
+      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+        SignBits[i] = 25;
+      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
+               MVT::i16)
+        SignBits[i] = 17;
+      else
+        return false;
+      IsPositive[i] = true;
+    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
+      // All the operands of BUILD_VECTOR need to be int constants.
+      // Find the smallest value range which all the operands belong to.
+      SignBits[i] = 32;
+      IsPositive[i] = true;
+      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
+        if (SubOp.isUndef())
+          continue;
+        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
+        if (!CN)
+          return false;
+        APInt IntVal = CN->getAPIntValue();
+        if (IntVal.isNegative())
+          IsPositive[i] = false;
+        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
+      }
+    } else {
+      SignBits[i] = DAG.ComputeNumSignBits(Opd);
+      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
+        IsPositive[i] = true;
+    }
+  }
+
+  bool AllPositive = IsPositive[0] && IsPositive[1];
+  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
+  // When ranges are from -128 ~ 127, use MULS8 mode.
+  if (MinSignBits >= 25)
+    Mode = MULS8;
+  // When ranges are from 0 ~ 255, use MULU8 mode.
+  else if (AllPositive && MinSignBits >= 24)
+    Mode = MULU8;
+  // When ranges are from -32768 ~ 32767, use MULS16 mode.
+  else if (MinSignBits >= 17)
+    Mode = MULS16;
+  // When ranges are from 0 ~ 65535, use MULU16 mode.
+  else if (AllPositive && MinSignBits >= 16)
+    Mode = MULU16;
+  else
+    return false;
+  return true;
+}
+
+/// When the operands of a vector mul are extended from smaller size values,
+/// like i8 and i16, the type of the mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+///     %2 = sext/zext <N x i8> %1 to <N x i32>
+///     %4 = sext/zext <N x i8> %3 to <N x i32>
+///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+///     %2 = zext/sext <N x i16> %1 to <N x i32>
+///     %4 = zext/sext <N x i16> %3 to <N x i32>
+///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // pmulld is supported since SSE41. It is better to use pmulld
+  // instead of pmullw+pmulhw.
+  if (Subtarget.hasSSE41())
+    return SDValue();
+
+  ShrinkMode Mode;
+  if (!canReduceVMulWidth(N, DAG, Mode))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getOperand(0).getValueType();
+  unsigned RegSize = 128;
+  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+  EVT ReducedVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+  // Shrink the operands of mul.
+  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+    // lower part is needed.
+    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+    if (Mode == MULU8 || Mode == MULS8) {
+      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+                         DL, VT, MulLo);
+    } else {
+      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+      // the higher part is also needed.
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  ReducedVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result.
+      // Generate shuffle functioning as punpcklwd.
+      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+      }
+      SDValue ResLo =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, &ShuffleMask[0]);
+      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+      // Generate shuffle functioning as punpckhwd.
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+      }
+      SDValue ResHi =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, &ShuffleMask[0]);
+      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+    }
+  } else {
+    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+    // to legalize the mul explicitly because implicit legalization for type
+    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+    // instructions which will not exist when we explicitly legalize it by
+    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+    // <4 x i16> undef).
+    //
+    // Legalize the operands of mul.
+    SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+                                 DAG.getUNDEF(ReducedVT));
+    Ops[0] = NewN0;
+    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+    Ops[0] = NewN1;
+    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+    if (Mode == MULU8 || Mode == MULS8) {
+      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+      // part is needed.
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+      // Convert the type of mul result to VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
+                                DL, ResVT, Mul);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+      // MULU16/MULS16, both parts are needed.
+      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  OpsVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result. Make sure the type of mul result is VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+  }
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (DCI.isBeforeLegalize() && VT.isVector()) + return reduceVMULWidth(N, DAG, Subtarget); + // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); @@ -26973,7 +27179,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); - EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); @@ -30268,7 +30473,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); case X86ISD::ADC: return combineADC(N, DAG, DCI); - case ISD::MUL: return combineMul(N, DAG, DCI); + case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: case ISD::SRA: case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll new file mode 100644 index 00000000000..185bf9b75d2 --- /dev/null +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -0,0 +1,864 @@ +; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s + +@c = external global i32*, align 8 + +; %val1 = load <2 x i8> +; %op1 = zext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_2xi8: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx +; CHECK-NEXT: movd %ecx, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <4 x i8> +; %op1 = zext<4 x i32> %val1 +; %val2 = load <4 x i8> +; %op2 = zext<4 x i32> %val2 +; %rst = mul <4 x i32> %op1, %op2 +; +define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_4xi8: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; 
CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1 + %tmp8 = zext <4 x i8> %wide.load to <4 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <4 x i8>* + %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1 + %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32> + %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <8 x i8> +; %op1 = zext<8 x i32> %val1 +; %val2 = load <8 x i8> +; %op2 = zext<8 x i32> %val2 +; %rst = mul <8 x i32> %op1, %op2 +; +define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_8xi8: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <8 x i8>* + %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1 + %tmp8 = zext <8 x i8> %wide.load to <8 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <8 x i8>* + %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1 + %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32> + %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <8 x i32>* + store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i8> +; %op1 = zext<16 x i32> %val1 +; %val2 = load <16 x i8> +; %op2 = zext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_16xi8: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; 
CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 +; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; CHECK-NEXT: movdqa %xmm1, %xmm4 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; CHECK-NEXT: pmullw %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm3 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1 + %tmp8 = zext <16 x i8> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i8>* + %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1 + %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = zext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_2xi16: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pmulhuw %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 
= getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <4 x i16> +; %op1 = zext<4 x i32> %val1 +; %val2 = load <4 x i16> +; %op2 = zext<4 x i32> %val2 +; %rst = mul <4 x i32> %op1, %op2 +; +define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_4xi16: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pmulhuw %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <4 x i16>* + %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1 + %tmp8 = zext <4 x i16> %wide.load to <4 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <4 x i16>* + %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1 + %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> + %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <8 x i16> +; %op1 = zext<8 x i32> %val1 +; %val2 = load <8 x i16> +; %op2 = zext<8 x i32> %val2 +; %rst = mul <8 x i32> %op1, %op2 +; +define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_8xi16: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 +; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pmulhuw %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1 + %tmp8 = zext <8 x i16> %wide.load to <8 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <8 x i16>* + %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 + %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> + %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <8 x i32>* + store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i16> +; %op1 = zext<16 x i32> %val1 +; %val2 = load <16 x i16> +; %op2 = zext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_16xi16: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 +; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; CHECK-NEXT: movdqu 
(%rsi,%rdx), %xmm2 +; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pmulhuw %xmm0, %xmm4 +; CHECK-NEXT: pmullw %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pmulhuw %xmm1, %xmm4 +; CHECK-NEXT: pmullw %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm3, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i16>* + %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 + %tmp8 = zext <16 x i16> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i16>* + %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 + %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i8> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = sext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_2xi8_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx +; CHECK-NEXT: movd %ecx, %xmm1 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-NEXT: psraw $8, %xmm0 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-NEXT: psraw $8, %xmm1 +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: psrad $16, %xmm0 +; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i8> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_2xi8_sext_zext: +; CHECK: # 
BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx +; CHECK-NEXT: movd %ecx, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-NEXT: psraw $8, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pmulhw %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm1, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = sext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_2xi16_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pmulhw %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_2xi16_sext_zext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; CHECK-NEXT: psrad $16, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: psrlq $32, %xmm3 +; CHECK-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-NEXT: psllq $32, %xmm3 +; CHECK-NEXT: paddq %xmm2, %xmm3 +; CHECK-NEXT: psrlq $32, %xmm1 +; CHECK-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-NEXT: psllq $32, %xmm1 +; CHECK-NEXT: paddq %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i16> +; %op1 = sext<16 x i32> %val1 +; %val2 = load <16 x i16> +; %op2 = sext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { +; CHECK-LABEL: mul_16xi16_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 +; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2 +; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pmulhw %xmm0, %xmm4 +; CHECK-NEXT: pmullw %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pmulhw %xmm1, %xmm4 +; CHECK-NEXT: pmullw %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm3, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; CHECK-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i16>* + %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 + %tmp8 = sext <16 x i16> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i16>* + %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 + %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) +; %rst = mul <2 x i32> %op1, %op2 +; +define 
void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst6:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhuw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
-- 
2.50.1
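Illustrative addendum (not part of the patch): a minimal sketch of an input that exercises the new combine, using the same llc invocation as the test's RUN line. The function name @mul_4xi8_args and the use of vector arguments instead of loads are made up for illustration. On an SSE2-only target the <4 x i32> multiply below should take the MULU8 path (pmullw plus a zero unpack) rather than the pmuludq sequence, while adding -mattr=+sse4.1 should keep it as a plain pmulld, since reduceVMULWidth bails out when SSE41 is available.

; Illustrative example, not taken from the patch.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2

define <4 x i32> @mul_4xi8_args(<4 x i8> %a, <4 x i8> %b) {
entry:
  ; Both operands are zero-extended from i8, so each scalar fits in 0..255
  ; and canReduceVMulWidth should select MULU8 for the multiply below.
  %op1 = zext <4 x i8> %a to <4 x i32>
  %op2 = zext <4 x i8> %b to <4 x i32>
  %rst = mul nuw nsw <4 x i32> %op1, %op2
  ret <4 x i32> %rst
}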