From b508009134c1349367df52d96bf8d8db6e7f7247 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 15 Jul 2019 17:50:31 +0000
Subject: [PATCH] AMDGPU: Add 24-bit mul intrinsics

Insert these during codegenprepare.

This works around a DAG issue where generic combines eliminate the 'and'
asserting the high bits are zero, which then exposes an unknown read
source to the mul combine. It isn't worth the hassle of trying to
insert an AssertZext or something to try to deal with it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366094 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsAMDGPU.td           |  10 +
 lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 127 +++++
 lib/Target/AMDGPU/SIISelLowering.cpp          |   5 +
 .../AMDGPU/amdgpu-codegenprepare-mul24.ll     | 494 ++++++++++++++++++
 test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll    |  14 +
 test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll    |  14 +
 test/CodeGen/AMDGPU/mad_uint24.ll             |  76 +++
 test/CodeGen/AMDGPU/mul.i16.ll                |  18 +-
 test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll      |   4 +-
 9 files changed, 751 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll

diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 43e827ec6ab..e92a6078ce4 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1350,6 +1350,16 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
 
+def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
 // llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
 //
 // bar_val is the total number of waves that will wait on this
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 75982075325..b750c6b5f6d 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -61,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   AssumptionCache *AC = nullptr;
   LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
 
   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -133,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
 
+
+  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+  bool isI24(Value *V, unsigned ScalarSize) const;
+  bool isU24(Value *V, unsigned ScalarSize) const;
+
+  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
+  /// DAG combines can remove the 'and' asserting the high bits are zero.
+  bool replaceMulWithMul24(BinaryOperator &I) const;
+
   /// Expands 24 bit div or rem.
   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den,
@@ -392,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
   return true;
 }
 
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
+                                               unsigned ScalarSize) const {
+  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
+  return ScalarSize - Known.countMinLeadingZeros();
+}
+
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
+                                             unsigned ScalarSize) const {
+  // In order for this to be a signed 24-bit value, bit 23 must
+  // be a sign bit.
+  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
+}
+
+bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
+  return ScalarSize >= 24 && // Types less than 24-bit should be treated
+                                     // as unsigned 24-bit values.
+    numBitsSigned(V, ScalarSize) < 24;
+}
+
+bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
+  return numBitsUnsigned(V, ScalarSize) <= 24;
+}
+
+static void extractValues(IRBuilder<> &Builder,
+                          SmallVectorImpl<Value *> &Values, Value *V) {
+  VectorType *VT = dyn_cast<VectorType>(V->getType());
+  if (!VT) {
+    Values.push_back(V);
+    return;
+  }
+
+  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
+    Values.push_back(Builder.CreateExtractElement(V, I));
+}
+
+/// Rebuild a value of type \p Ty from the per-element results in \p Values.
+static Value *insertValues(IRBuilder<> &Builder, Type *Ty,
+                           SmallVectorImpl<Value *> &Values) {
+  if (!Ty->isVectorTy()) // A <1 x T> vector must still be rebuilt below.
+    return Values[0];
+
+  Value *NewVal = UndefValue::get(Ty);
+  for (int I = 0, E = Values.size(); I != E; ++I)
+    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
+
+  return NewVal;
+}
+
+bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
+  if (I.getOpcode() != Instruction::Mul)
+    return false;
+
+  Type *Ty = I.getType();
+  unsigned Size = Ty->getScalarSizeInBits();
+  if (Size <= 16 && ST->has16BitInsts())
+    return false;
+
+  // Prefer scalar if this could be s_mul_i32
+  if (DA->isUniform(&I))
+    return false;
+
+  Value *LHS = I.getOperand(0);
+  Value *RHS = I.getOperand(1);
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+
+  // TODO: Match mulhi24? For Size > 32 the i32 result drops high product bits.
+  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
+    IntrID = Intrinsic::amdgcn_mul_u24;
+  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
+    IntrID = Intrinsic::amdgcn_mul_i24;
+  } else
+    return false;
+
+  SmallVector<Value *, 4> LHSVals;
+  SmallVector<Value *, 4> RHSVals;
+  SmallVector<Value *, 4> ResultVals;
+  extractValues(Builder, LHSVals, LHS);
+  extractValues(Builder, RHSVals, RHS);
+
+
+  IntegerType *I32Ty = Builder.getInt32Ty();
+  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
+  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+    Value *LHS, *RHS;
+    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+    } else {
+      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+    }
+
+    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+
+    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
+                                                     LHSVals[I]->getType()));
+    } else {
+      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
+                                                     LHSVals[I]->getType()));
+    }
+  }
+
+  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -756,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
       DA->isUniform(&I) && promoteUniformOpToI32(I))
     return true;
 
+  if (replaceMulWithMul24(I))
+    return true;
+
   bool Changed = false;
   Instruction::BinaryOps Opc = I.getOpcode();
   Type *Ty = I.getType();
@@ -882,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
 
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
+  DL = &Mod->getDataLayout();
   return false;
 }
 
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index b90a0d28e9e..a3226577cd0 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5836,6 +5836,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_cos:
     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
 
+  case Intrinsic::amdgcn_mul_u24:
+    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::amdgcn_mul_i24:
+    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
+
   case Intrinsic::amdgcn_log_clamp: {
     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
       return SDValue();
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
new file mode 100644
index 00000000000..cda1da825f9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
@@ -0,0 +1,494 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s
+
+define i16 @mul_i16(i16 %lhs, i16 %rhs) {
+; SI-LABEL: @mul_i16(
+; SI-NEXT:    [[TMP1:%.*]] = zext i16 [[LHS:%.*]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = zext i16 [[RHS:%.*]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+; SI-NEXT:    ret i16 [[TMP4]]
+;
+; VI-LABEL: @mul_i16(
+; VI-NEXT:    [[MUL:%.*]] = mul i16 [[LHS:%.*]], [[RHS:%.*]]
+; VI-NEXT:    ret i16 [[MUL]]
+;
+  %mul = mul i16 %lhs, %rhs
+  ret i16 %mul
+}
+
+define i32 @smul24_i32(i32 %lhs, i32 %rhs) {
+; SI-LABEL: @smul24_i32(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 8
+; SI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
+; SI-NEXT:    [[RHS24:%.*]] = ashr i32 [[LHS]], 8
+; SI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    ret i32 [[TMP1]]
+;
+; VI-LABEL: @smul24_i32(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 8
+; VI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
+; VI-NEXT:    [[RHS24:%.*]] = ashr i32 [[LHS]], 8
+; VI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    ret i32 [[TMP1]]
+;
+  %shl.lhs = shl i32 %lhs, 8
+  %lhs24 = ashr i32 %shl.lhs, 8
+  %lshr.rhs = shl i32 %rhs, 8
+  %rhs24 = ashr i32 %lhs, 8
+  %mul = mul i32 %lhs24, %rhs24
+  ret i32 %mul
+}
+
+define <2 x i32> @smul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; SI-LABEL: @smul24_v2i32(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl <2 x i32> [[LHS:%.*]], <i32 8, i32 8>
+; SI-NEXT:    [[LHS24:%.*]] = ashr <2 x i32> [[SHL_LHS]], <i32 8, i32 8>
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i32> [[RHS:%.*]], <i32 8, i32 8>
+; SI-NEXT:    [[RHS24:%.*]] = ashr <2 x i32> [[LHS]], <i32 8, i32 8>
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
+; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
+; SI-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; SI-NEXT:    ret <2 x i32> [[TMP8]]
+;
+; VI-LABEL: @smul24_v2i32(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl <2 x i32> [[LHS:%.*]], <i32 8, i32 8>
+; VI-NEXT:    [[LHS24:%.*]] = ashr <2 x i32> [[SHL_LHS]], <i32 8, i32 8>
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i32> [[RHS:%.*]], <i32 8, i32 8>
+; VI-NEXT:    [[RHS24:%.*]] = ashr <2 x i32> [[LHS]], <i32 8, i32 8>
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
+; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
+; VI-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; VI-NEXT:    ret <2 x i32> [[TMP8]]
+;
+  %shl.lhs = shl <2 x i32> %lhs, <i32 8, i32 8>
+  %lhs24 = ashr <2 x i32> %shl.lhs, <i32 8, i32 8>
+  %lshr.rhs = shl <2 x i32> %rhs, <i32 8, i32 8>
+  %rhs24 = ashr <2 x i32> %lhs, <i32 8, i32 8>
+  %mul = mul <2 x i32> %lhs24, %rhs24
+  ret <2 x i32> %mul
+}
+
+define i32 @umul24_i32(i32 %lhs, i32 %rhs) {
+; SI-LABEL: @umul24_i32(
+; SI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
+; SI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
+; SI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    ret i32 [[TMP1]]
+;
+; VI-LABEL: @umul24_i32(
+; VI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
+; VI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
+; VI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    ret i32 [[TMP1]]
+;
+  %lhs24 = and i32 %lhs, 16777215
+  %rhs24 = and i32 %rhs, 16777215
+  %mul = mul i32 %lhs24, %rhs24
+  ret i32 %mul
+}
+
+define <2 x i32> @umul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; SI-LABEL: @umul24_v2i32(
+; SI-NEXT:    [[LHS24:%.*]] = and <2 x i32> [[LHS:%.*]], <i32 16777215, i32 16777215>
+; SI-NEXT:    [[RHS24:%.*]] = and <2 x i32> [[RHS:%.*]], <i32 16777215, i32 16777215>
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
+; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
+; SI-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; SI-NEXT:    ret <2 x i32> [[TMP8]]
+;
+; VI-LABEL: @umul24_v2i32(
+; VI-NEXT:    [[LHS24:%.*]] = and <2 x i32> [[LHS:%.*]], <i32 16777215, i32 16777215>
+; VI-NEXT:    [[RHS24:%.*]] = and <2 x i32> [[RHS:%.*]], <i32 16777215, i32 16777215>
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
+; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
+; VI-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; VI-NEXT:    ret <2 x i32> [[TMP8]]
+;
+  %lhs24 = and <2 x i32> %lhs, <i32 16777215, i32 16777215>
+  %rhs24 = and <2 x i32> %rhs, <i32 16777215, i32 16777215>
+  %mul = mul <2 x i32> %lhs24, %rhs24
+  ret <2 x i32> %mul
+}
+
+define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
+; SI-LABEL: @smul24_i64(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl i64 [[LHS:%.*]], 40
+; SI-NEXT:    [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
+; SI-NEXT:    [[RHS24:%.*]] = ashr i64 [[LHS]], 40
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; SI-NEXT:    ret i64 [[TMP4]]
+;
+; VI-LABEL: @smul24_i64(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl i64 [[LHS:%.*]], 40
+; VI-NEXT:    [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
+; VI-NEXT:    [[RHS24:%.*]] = ashr i64 [[LHS]], 40
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; VI-NEXT:    ret i64 [[TMP4]]
+;
+  %shl.lhs = shl i64 %lhs, 40
+  %lhs24 = ashr i64 %shl.lhs, 40
+  %lshr.rhs = shl i64 %rhs, 40
+  %rhs24 = ashr i64 %lhs, 40
+  %mul = mul i64 %lhs24, %rhs24
+  ret i64 %mul
+}
+
+define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
+; SI-LABEL: @umul24_i64(
+; SI-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
+; SI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    ret i64 [[TMP4]]
+;
+; VI-LABEL: @umul24_i64(
+; VI-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
+; VI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    ret i64 [[TMP4]]
+;
+  %lhs24 = and i64 %lhs, 16777215
+  %rhs24 = and i64 %rhs, 16777215
+  %mul = mul i64 %lhs24, %rhs24
+  ret i64 %mul
+}
+
+define i31 @smul24_i31(i31 %lhs, i31 %rhs) {
+; SI-LABEL: @smul24_i31(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl i31 [[LHS:%.*]], 7
+; SI-NEXT:    [[LHS24:%.*]] = ashr i31 [[SHL_LHS]], 7
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i31 [[RHS:%.*]], 7
+; SI-NEXT:    [[RHS24:%.*]] = ashr i31 [[LHS]], 7
+; SI-NEXT:    [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
+; SI-NEXT:    ret i31 [[TMP4]]
+;
+; VI-LABEL: @smul24_i31(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl i31 [[LHS:%.*]], 7
+; VI-NEXT:    [[LHS24:%.*]] = ashr i31 [[SHL_LHS]], 7
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i31 [[RHS:%.*]], 7
+; VI-NEXT:    [[RHS24:%.*]] = ashr i31 [[LHS]], 7
+; VI-NEXT:    [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
+; VI-NEXT:    ret i31 [[TMP4]]
+;
+  %shl.lhs = shl i31 %lhs, 7
+  %lhs24 = ashr i31 %shl.lhs, 7
+  %lshr.rhs = shl i31 %rhs, 7
+  %rhs24 = ashr i31 %lhs, 7
+  %mul = mul i31 %lhs24, %rhs24
+  ret i31 %mul
+}
+
+define i31 @umul24_i31(i31 %lhs, i31 %rhs) {
+; SI-LABEL: @umul24_i31(
+; SI-NEXT:    [[LHS24:%.*]] = and i31 [[LHS:%.*]], 16777215
+; SI-NEXT:    [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
+; SI-NEXT:    [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
+; SI-NEXT:    ret i31 [[TMP4]]
+;
+; VI-LABEL: @umul24_i31(
+; VI-NEXT:    [[LHS24:%.*]] = and i31 [[LHS:%.*]], 16777215
+; VI-NEXT:    [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
+; VI-NEXT:    [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
+; VI-NEXT:    ret i31 [[TMP4]]
+;
+  %lhs24 = and i31 %lhs, 16777215
+  %rhs24 = and i31 %rhs, 16777215
+  %mul = mul i31 %lhs24, %rhs24
+  ret i31 %mul
+}
+
+define <2 x i31> @umul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
+; SI-LABEL: @umul24_v2i31(
+; SI-NEXT:    [[LHS24:%.*]] = and <2 x i31> [[LHS:%.*]], <i31 16777215, i31 16777215>
+; SI-NEXT:    [[RHS24:%.*]] = and <2 x i31> [[RHS:%.*]], <i31 16777215, i31 16777215>
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
+; SI-NEXT:    [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
+; SI-NEXT:    [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
+; SI-NEXT:    [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
+; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
+; SI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
+; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
+; SI-NEXT:    [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; SI-NEXT:    ret <2 x i31> [[TMP14]]
+;
+; VI-LABEL: @umul24_v2i31(
+; VI-NEXT:    [[LHS24:%.*]] = and <2 x i31> [[LHS:%.*]], <i31 16777215, i31 16777215>
+; VI-NEXT:    [[RHS24:%.*]] = and <2 x i31> [[RHS:%.*]], <i31 16777215, i31 16777215>
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
+; VI-NEXT:    [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
+; VI-NEXT:    [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
+; VI-NEXT:    [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
+; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
+; VI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
+; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
+; VI-NEXT:    [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; VI-NEXT:    ret <2 x i31> [[TMP14]]
+;
+  %lhs24 = and <2 x i31> %lhs, <i31 16777215, i31 16777215>
+  %rhs24 = and <2 x i31> %rhs, <i31 16777215, i31 16777215>
+  %mul = mul <2 x i31> %lhs24, %rhs24
+  ret <2 x i31> %mul
+}
+
+define <2 x i31> @smul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
+; SI-LABEL: @smul24_v2i31(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl <2 x i31> [[LHS:%.*]], <i31 8, i31 8>
+; SI-NEXT:    [[LHS24:%.*]] = ashr <2 x i31> [[SHL_LHS]], <i31 8, i31 8>
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i31> [[RHS:%.*]], <i31 8, i31 8>
+; SI-NEXT:    [[RHS24:%.*]] = ashr <2 x i31> [[LHS]], <i31 8, i31 8>
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
+; SI-NEXT:    [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
+; SI-NEXT:    [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
+; SI-NEXT:    [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
+; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
+; SI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
+; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
+; SI-NEXT:    [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; SI-NEXT:    ret <2 x i31> [[TMP14]]
+;
+; VI-LABEL: @smul24_v2i31(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl <2 x i31> [[LHS:%.*]], <i31 8, i31 8>
+; VI-NEXT:    [[LHS24:%.*]] = ashr <2 x i31> [[SHL_LHS]], <i31 8, i31 8>
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i31> [[RHS:%.*]], <i31 8, i31 8>
+; VI-NEXT:    [[RHS24:%.*]] = ashr <2 x i31> [[LHS]], <i31 8, i31 8>
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
+; VI-NEXT:    [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
+; VI-NEXT:    [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
+; VI-NEXT:    [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
+; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
+; VI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
+; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
+; VI-NEXT:    [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; VI-NEXT:    ret <2 x i31> [[TMP14]]
+;
+  %shl.lhs = shl <2 x i31> %lhs, <i31 8, i31 8>
+  %lhs24 = ashr <2 x i31> %shl.lhs, <i31 8, i31 8>
+  %lshr.rhs = shl <2 x i31> %rhs, <i31 8, i31 8>
+  %rhs24 = ashr <2 x i31> %lhs, <i31 8, i31 8>
+  %mul = mul <2 x i31> %lhs24, %rhs24
+  ret <2 x i31> %mul
+}
+
+define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
+; SI-LABEL: @smul24_i33(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl i33 [[LHS:%.*]], 9
+; SI-NEXT:    [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
+; SI-NEXT:    [[RHS24:%.*]] = ashr i33 [[LHS]], 9
+; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i33
+; SI-NEXT:    ret i33 [[TMP4]]
+;
+; VI-LABEL: @smul24_i33(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl i33 [[LHS:%.*]], 9
+; VI-NEXT:    [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
+; VI-NEXT:    [[RHS24:%.*]] = ashr i33 [[LHS]], 9
+; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i33
+; VI-NEXT:    ret i33 [[TMP4]]
+;
+  %shl.lhs = shl i33 %lhs, 9
+  %lhs24 = ashr i33 %shl.lhs, 9
+  %lshr.rhs = shl i33 %rhs, 9
+  %rhs24 = ashr i33 %lhs, 9
+  %mul = mul i33 %lhs24, %rhs24
+  ret i33 %mul
+}
+
+define i33 @umul24_i33(i33 %lhs, i33 %rhs) {
+; SI-LABEL: @umul24_i33(
+; SI-NEXT:    [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
+; SI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
+; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i33
+; SI-NEXT:    ret i33 [[TMP4]]
+;
+; VI-LABEL: @umul24_i33(
+; VI-NEXT:    [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
+; VI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
+; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i33
+; VI-NEXT:    ret i33 [[TMP4]]
+;
+  %lhs24 = and i33 %lhs, 16777215
+  %rhs24 = and i33 %rhs, 16777215
+  %mul = mul i33 %lhs24, %rhs24
+  ret i33 %mul
+}
+
+define i32 @smul25_i32(i32 %lhs, i32 %rhs) {
+; SI-LABEL: @smul25_i32(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 7
+; SI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 7
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 7
+; SI-NEXT:    [[RHS24:%.*]] = ashr i32 [[LHS]], 7
+; SI-NEXT:    [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
+; SI-NEXT:    ret i32 [[MUL]]
+;
+; VI-LABEL: @smul25_i32(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 7
+; VI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 7
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 7
+; VI-NEXT:    [[RHS24:%.*]] = ashr i32 [[LHS]], 7
+; VI-NEXT:    [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
+; VI-NEXT:    ret i32 [[MUL]]
+;
+  %shl.lhs = shl i32 %lhs, 7
+  %lhs24 = ashr i32 %shl.lhs, 7
+  %lshr.rhs = shl i32 %rhs, 7
+  %rhs24 = ashr i32 %lhs, 7
+  %mul = mul i32 %lhs24, %rhs24
+  ret i32 %mul
+}
+
+define i32 @umul25_i32(i32 %lhs, i32 %rhs) {
+; SI-LABEL: @umul25_i32(
+; SI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 33554431
+; SI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 33554431
+; SI-NEXT:    [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
+; SI-NEXT:    ret i32 [[MUL]]
+;
+; VI-LABEL: @umul25_i32(
+; VI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 33554431
+; VI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 33554431
+; VI-NEXT:    [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
+; VI-NEXT:    ret i32 [[MUL]]
+;
+  %lhs24 = and i32 %lhs, 33554431
+  %rhs24 = and i32 %rhs, 33554431
+  %mul = mul i32 %lhs24, %rhs24
+  ret i32 %mul
+}
+
+define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
+; SI-LABEL: @smul24_v2i33(
+; SI-NEXT:    [[SHL_LHS:%.*]] = shl <2 x i33> [[LHS:%.*]], <i33 9, i33 9>
+; SI-NEXT:    [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
+; SI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
+; SI-NEXT:    [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
+; SI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i33
+; SI-NEXT:    [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
+; SI-NEXT:    [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
+; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
+; SI-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i33
+; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i33> undef, i33 [[TMP8]], i64 0
+; SI-NEXT:    [[TMP14:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
+; SI-NEXT:    ret <2 x i33> [[TMP14]]
+;
+; VI-LABEL: @smul24_v2i33(
+; VI-NEXT:    [[SHL_LHS:%.*]] = shl <2 x i33> [[LHS:%.*]], <i33 9, i33 9>
+; VI-NEXT:    [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
+; VI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
+; VI-NEXT:    [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
+; VI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i33
+; VI-NEXT:    [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
+; VI-NEXT:    [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
+; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
+; VI-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i33
+; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i33> undef, i33 [[TMP8]], i64 0
+; VI-NEXT:    [[TMP14:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
+; VI-NEXT:    ret <2 x i33> [[TMP14]]
+;
+  %shl.lhs = shl <2 x i33> %lhs, <i33 9, i33 9>
+  %lhs24 = ashr <2 x i33> %shl.lhs, <i33 9, i33 9>
+  %lshr.rhs = shl <2 x i33> %rhs, <i33 9, i33 9>
+  %rhs24 = ashr <2 x i33> %lhs, <i33 9, i33 9>
+  %mul = mul <2 x i33> %lhs24, %rhs24
+  ret <2 x i33> %mul
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
new file mode 100644
index 00000000000..a1dbe9a1322
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_mul_i24:
+; GCN: v_mul_i32_i24
+define amdgpu_kernel void @test_mul_i24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
+  %val = call i32 @llvm.amdgcn.mul.i24(i32 %src1, i32 %src2) #0 ; direct call of the intrinsic on the two i32 kernel arguments; the check above expects v_mul_i32_i24
+  store i32 %val, i32 addrspace(1)* %out ; write the intrinsic's result to %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.mul.i24(i32, i32) #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
new file mode 100644
index 00000000000..810b50337e2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_mul_u24:
+; GCN: v_mul_u32_u24
+define amdgpu_kernel void @test_mul_u24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
+  %val = call i32 @llvm.amdgcn.mul.u24(i32 %src1, i32 %src2) #0 ; direct call of the intrinsic on the two i32 kernel arguments; the check above expects v_mul_u32_u24
+  store i32 %val, i32 addrspace(1)* %out ; write the intrinsic's result to %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.mul.u24(i32, i32) #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
index 5f109624daf..7c7b5925adf 100644
--- a/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -233,3 +233,79 @@ entry:
   store i64 %mad_ext, i64 addrspace(1)* %out
   ret void
 }
+
+; The ands are asserting the high bits are 0. SimplifyDemandedBits on
+; the adds would remove the ands before the target combine on the mul
+; had a chance to form mul24. The mul combine would then see
+; extractelement with no known bits and fail. All of the mul/add
+; combos in this loop should form v_mad_u32_u24.
+
+; FUNC-LABEL: {{^}}mad24_known_bits_destroyed:
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+; GCN: v_mad_u32_u24
+define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 addrspace(1)* %arg7, <4 x i32> addrspace(1)* %arg8) #0 {
+bb: ; entry: build the loop-invariant 24-bit multipliers (%tmp, %tmp11, %tmp14, %tmp17) and addends (%arg5, %tmp10, %tmp13, %tmp16)
+  %tmp = and i32 %arg4, 16777215 ; 16777215 = 0xffffff, i.e. keep only the low 24 bits
+  %tmp9 = extractelement <4 x i32> %arg1, i64 1
+  %tmp10 = extractelement <4 x i32> %arg3, i64 1
+  %tmp11 = and i32 %tmp9, 16777215
+  %tmp12 = extractelement <4 x i32> %arg1, i64 2
+  %tmp13 = extractelement <4 x i32> %arg3, i64 2
+  %tmp14 = and i32 %tmp12, 16777215
+  %tmp15 = extractelement <4 x i32> %arg1, i64 3
+  %tmp16 = extractelement <4 x i32> %arg3, i64 3
+  %tmp17 = and i32 %tmp15, 16777215
+  br label %bb19
+
+bb18:                                             ; preds = %bb19
+  ret void
+
+bb19:                                             ; preds = %bb19, %bb
+  %tmp20 = phi i32 [ %arg, %bb ], [ %tmp40, %bb19 ] ; loop-carried scalar value
+  %tmp21 = phi i32 [ 0, %bb ], [ %tmp54, %bb19 ] ; iteration counter
+  %tmp22 = phi <4 x i32> [ %arg2, %bb ], [ %tmp53, %bb19 ] ; loop-carried vector value
+  %tmp23 = and i32 %tmp20, 16777215 ; first round: mask each loop-carried input to 24 bits, then mul + add (4 pairs)
+  %tmp24 = mul i32 %tmp23, %tmp
+  %tmp25 = add i32 %tmp24, %arg5
+  %tmp26 = extractelement <4 x i32> %tmp22, i64 1
+  %tmp27 = and i32 %tmp26, 16777215
+  %tmp28 = mul i32 %tmp27, %tmp11
+  %tmp29 = add i32 %tmp28, %tmp10
+  %tmp30 = extractelement <4 x i32> %tmp22, i64 2
+  %tmp31 = and i32 %tmp30, 16777215
+  %tmp32 = mul i32 %tmp31, %tmp14
+  %tmp33 = add i32 %tmp32, %tmp13
+  %tmp34 = extractelement <4 x i32> %tmp22, i64 3
+  %tmp35 = and i32 %tmp34, 16777215
+  %tmp36 = mul i32 %tmp35, %tmp17
+  %tmp37 = add i32 %tmp36, %tmp16
+  %tmp38 = and i32 %tmp25, 16777215 ; second round: re-mask each first-round add result, then mul + add again (4 more pairs, 8 muls total)
+  %tmp39 = mul i32 %tmp38, %tmp
+  %tmp40 = add i32 %tmp39, %arg5
+  store i32 %tmp40, i32 addrspace(1)* %arg7
+  %tmp41 = insertelement <4 x i32> undef, i32 %tmp40, i32 0 ; element 0 of the result vector reuses the scalar result
+  %tmp42 = and i32 %tmp29, 16777215
+  %tmp43 = mul i32 %tmp42, %tmp11
+  %tmp44 = add i32 %tmp43, %tmp10
+  %tmp45 = insertelement <4 x i32> %tmp41, i32 %tmp44, i32 1
+  %tmp46 = and i32 %tmp33, 16777215
+  %tmp47 = mul i32 %tmp46, %tmp14
+  %tmp48 = add i32 %tmp47, %tmp13
+  %tmp49 = insertelement <4 x i32> %tmp45, i32 %tmp48, i32 2
+  %tmp50 = and i32 %tmp37, 16777215
+  %tmp51 = mul i32 %tmp50, %tmp17
+  %tmp52 = add i32 %tmp51, %tmp16
+  %tmp53 = insertelement <4 x i32> %tmp49, i32 %tmp52, i32 3
+  store <4 x i32> %tmp53, <4 x i32> addrspace(1)* %arg8
+  %tmp54 = add nuw nsw i32 %tmp21, 1
+  %tmp55 = icmp eq i32 %tmp54, %arg6 ; leave the loop once the counter equals %arg6
+  br i1 %tmp55, label %bb18, label %bb19
+}
+
+attributes #0 = { norecurse nounwind }
diff --git a/test/CodeGen/AMDGPU/mul.i16.ll b/test/CodeGen/AMDGPU/mul.i16.ll
index f67f17ad78a..48619055c8e 100644
--- a/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/test/CodeGen/AMDGPU/mul.i16.ll
@@ -41,8 +41,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}v_mul_v2i16:
-; SI: v_mul_lo_u32
-; SI: v_mul_lo_u32
+; SI: v_mul_u32_u24
+; SI: v_mul_u32_u24
 
 ; VI: v_mul_lo_u16_sdwa
 ; VI: v_mul_lo_u16_e32
@@ -59,9 +59,9 @@ define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
 
 ; FIXME: Unpack garbage on gfx9
 ; GCN-LABEL: {{^}}v_mul_v3i16:
-; SI: v_mul_lo_u32
-; SI: v_mul_lo_u32
-; SI: v_mul_lo_u32
+; SI: v_mul_u32_u24
+; SI: v_mul_u32_u24
+; SI: v_mul_u32_u24
 
 ; VI: v_mul_lo_u16
 ; VI: v_mul_lo_u16
@@ -77,10 +77,10 @@ define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
 }
 
 ; GCN-LABEL: {{^}}v_mul_v4i16:
-; SI: v_mul_lo_u32
-; SI: v_mul_lo_u32
-; SI: v_mul_lo_u32
-; SI: v_mul_lo_u32
+; SI: v_mul_u32_u24
+; SI: v_mul_u32_u24
+; SI: v_mul_u32_u24
+; SI: v_mul_u32_u24
 
 ; VI: v_mul_lo_u16_sdwa
 ; VI: v_mul_lo_u16_e32
diff --git a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 019e2b51ce1..3ced4708580 100644
--- a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -249,8 +249,8 @@ entry:
 ; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1
 ; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
 ; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
-; GCN: v_and_b32_e32 v1, s4, v1
-; GCN: v_and_b32_e32 v0, 0x7ffffe, v0
+; GCN-DAG: v_and_b32_e32 v1, s4, v1
+; GCN-DAG: v_and_b32_e32 v0, 0x7ffffe, v0
 ; GCN: v_mul_u32_u24_e32 v0, v0, v1
 ; GCN: v_and_b32_e32 v0, 0x1fffe, v0
 ; GCN: v_mul_u32_u24_e32 v0, 0x63, v0
-- 
2.40.0