ret <8 x double> %res
}
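+
+; Tests for the 512-bit signed 32x32->64-bit multiply intrinsic, which should
+; select vpmuldq. Suffixes: rr = reg/reg, rm = reg/mem, rmb = broadcast-mem;
+; a trailing k is merge masking, kz is zero masking.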
+define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_mul_epi32_rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ ret <8 x i64> %res
+}
+
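+; The masked variants express the mask in IR as a bitcast of the i8 mask to
+; <8 x i1> followed by a select; the backend should fold this into the
+; instruction's {%k1} (merge) or {%k1} {z} (zero) masking.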
+define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mul_epi32_rrk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mul_epi32_rrkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
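+; The rm variants check that a full 512-bit load of the second operand is
+; folded into the instruction's memory operand.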
+define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mul_epi32_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mul_epi32_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mul_epi32_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
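+; The rmb variants build a splat of a single i64 load via insertelement +
+; shufflevector; this should fold into the {1to8} embedded-broadcast form.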
+define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
+; CHECK-LABEL: test_mul_epi32_rmb:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %b = bitcast <8 x i64> %b64 to <16 x i32>
+ %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mul_epi32_rmbk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %b = bitcast <8 x i64> %b64 to <16 x i32>
+ %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mul_epi32_rmbkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %b = bitcast <8 x i64> %b64 to <16 x i32>
+ %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>)
+
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr:
; CHECK: ## BB#0:
declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
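+
+; Unsigned counterparts of the tests above: the pmulu.dq.512 intrinsic should
+; select vpmuludq, with the same rr/rm/rmb and masking coverage.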
+define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_mul_epu32_rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mul_epu32_rrk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mul_epu32_rrkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mul_epu32_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mul_epu32_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mul_epu32_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
+; CHECK-LABEL: test_mul_epu32_rmb:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %b = bitcast <8 x i64> %b64 to <16 x i32>
+ %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mul_epu32_rmbk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %b = bitcast <8 x i64> %b64 to <16 x i32>
+ %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mul_epu32_rmbkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %b = bitcast <8 x i64> %b64 to <16 x i32>
+ %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>)
+
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr:
; CHECK: ## BB#0: