ret <8 x i64> %1
}
+;
+; Constant Folding
+;
+
+define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_128(
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 undef, i32 -1, i32 undef>, <4 x i32> <i32 2147483647, i32 undef, i32 1, i32 undef>)
+; CHECK-NEXT: ret <2 x i64> [[TMP1]]
+;
+ %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_256(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>)
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
+ ret <4 x i64> %1
+}
+
+define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 -1, i32 undef, i32 65536, i32 undef, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 255, i32 undef, i32 65535, i32 undef, i32 0, i32 undef, i32 -65535, i32 undef, i32 2147483647, i32 undef, i32 65536, i32 undef>)
+; CHECK-NEXT: ret <8 x i64> [[TMP1]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
+ ret <8 x i64> %1
+}
+
+define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_128(
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 undef, i32 -1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 -2, i32 undef>)
+; CHECK-NEXT: ret <2 x i64> [[TMP1]]
+;
+ %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_256(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 undef, i32 -65535, i32 undef, i32 65536, i32 undef, i32 -2147483648, i32 undef>, <8 x i32> <i32 0, i32 undef, i32 -65535, i32 undef, i32 2147483647, i32 undef, i32 65536, i32 undef>)
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
+ ret <4 x i64> %1
+}
+
+define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 -3, i32 undef, i32 8, i32 undef, i32 -256, i32 undef, i32 undef, i32 undef, i32 -65535, i32 undef, i32 65536, i32 undef, i32 -2147483648, i32 undef>)
+; CHECK-NEXT: ret <8 x i64> [[TMP1]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
+ ret <8 x i64> %1
+}
+
;
; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required.
;