From: Craig Topper Date: Mon, 20 Feb 2017 02:47:42 +0000 (+0000) Subject: [AVX-512] Add more VPTERNLOG patterns to enable folding of broadcast loads that aren... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5686a0d2ba72c4a81ac710953b32adebb8578ced;p=llvm [AVX-512] Add more VPTERNLOG patterns to enable folding of broadcast loads that aren't in operand 2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295634 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 3c62d16d42e..484ace999ce 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -8992,6 +8992,45 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, _.RC:$src1)), (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching broadcasts in other positions. + def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4))), + (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching zero masking with broadcasts in other + // positions. 
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3, + (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3, + (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching masked broadcasts with different + // operand orders. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; } multiclass avx512_common_ternlog{ diff --git a/test/CodeGen/X86/avx512-vpternlog-commute.ll b/test/CodeGen/X86/avx512-vpternlog-commute.ll index 988b43d403b..12b21762e78 100644 --- a/test/CodeGen/X86/avx512-vpternlog-commute.ll +++ b/test/CodeGen/X86/avx512-vpternlog-commute.ll @@ -521,8 +521,7 @@ define <16 x i32> @vpternlog_v16i32_021_load2_maskz(<16 x i32> %x0, <16 x i32> % define <16 x i32> @vpternlog_v16i32_012_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: vpternlog_v16i32_012_broadcast0: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $9, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: vpternlogd $9, (%rdi){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %x0_scalar = load i32, i32* %ptr_x0 %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0 @@ -534,8 +533,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0(i32* %ptr_x0, <16 x i32> %x1, define <16 x i32> @vpternlog_v16i32_012_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) { ; CHECK-LABEL: vpternlog_v16i32_012_broadcast1: ; 
CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %x1_scalar = load i32, i32* %ptr_x1 %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0 @@ -559,8 +557,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast2(<16 x i32> %x0, <16 x i32> %x define <16 x i32> @vpternlog_v16i32_102_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: vpternlog_v16i32_102_broadcast0: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %x0_scalar = load i32, i32* %ptr_x0 %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0 @@ -572,8 +569,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast0(i32* %ptr_x0, <16 x i32> %x1, define <16 x i32> @vpternlog_v16i32_102_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) { ; CHECK-LABEL: vpternlog_v16i32_102_broadcast1: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $9, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: vpternlogd $9, (%rdi){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %x1_scalar = load i32, i32* %ptr_x1 %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0 @@ -609,9 +605,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast0(i32* %ptr_x0, <16 x i32> %x1, define <16 x i32> @vpternlog_v16i32_210_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) { ; CHECK-LABEL: vpternlog_v16i32_210_broadcast1: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $9, %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %x1_scalar = load i32, i32* %ptr_x1 %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0 @@ -623,8 +617,7 @@ define <16 x i32> 
@vpternlog_v16i32_210_broadcast1(<16 x i32> %x0, i32* %ptr_x1, define <16 x i32> @vpternlog_v16i32_210_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) { ; CHECK-LABEL: vpternlog_v16i32_210_broadcast2: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogd $33, (%rdi){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %x2_scalar = load i32, i32* %ptr_x2 %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0 @@ -652,8 +645,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask(<16 x i32> %x0, i32* %x1 ; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_mask: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 @@ -679,8 +671,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast0_mask(i32* %x0ptr, <16 x i32> ; CHECK-LABEL: vpternlog_v16i32_102_broadcast0_mask: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 @@ -736,8 +727,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast1_mask(<16 x i32> %x0, i32* %x1 ; CHECK-LABEL: vpternlog_v16i32_210_broadcast1_mask: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %x1scalar = load i32, i32* %x1ptr @@ -794,8 +784,7 @@ define <16 x i32> 
@vpternlog_v16i32_021_broadcast2_mask(<16 x i32> %x0, <16 x i3 ; CHECK-LABEL: vpternlog_v16i32_021_broadcast2_mask: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 @@ -808,8 +797,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0_maskz(i32* %x0ptr, <16 x i32> ; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $9, %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $9, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 @@ -822,8 +810,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast1_maskz(<16 x i32> %x0, i32* %x ; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 @@ -849,8 +836,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast0_maskz(i32* %x0ptr, <16 x i32> ; CHECK-LABEL: vpternlog_v16i32_102_broadcast0_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 @@ -863,8 +849,7 @@ define <16 x i32> 
@vpternlog_v16i32_102_broadcast1_maskz(<16 x i32> %x0, i32* %x ; CHECK-LABEL: vpternlog_v16i32_102_broadcast1_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $9, %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $9, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 @@ -903,9 +888,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast1_maskz(<16 x i32> %x0, i32* %x ; CHECK-LABEL: vpternlog_v16i32_210_broadcast1_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $9, %zmm0, %zmm1, %zmm2 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 @@ -918,8 +901,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast2_maskz(<16 x i32> %x0, <16 x i ; CHECK-LABEL: vpternlog_v16i32_210_broadcast2_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $33, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 @@ -932,8 +914,7 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast0_maskz(i32* %x0ptr, <16 x i32> ; CHECK-LABEL: vpternlog_v16i32_021_broadcast0_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $33, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 @@ -959,8 +940,7 @@ define 
<16 x i32> @vpternlog_v16i32_021_broadcast2_maskz(<16 x i32> %x0, <16 x i ; CHECK-LABEL: vpternlog_v16i32_021_broadcast2_maskz: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2 -; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vpternlogd $65, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0