From: Craig Topper Date: Wed, 30 Aug 2017 07:48:39 +0000 (+0000) Subject: [AVX512] Correct isel patterns to support selecting masked vbroadcastf32x2/vbroadcast... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e841c283740a653e3307b4ab3f88ad6774c47915;p=llvm [AVX512] Correct isel patterns to support selecting masked vbroadcastf32x2/vbroadcasti32x2 Summary: This patch adjusts the patterns to make the result type of the broadcast node vXf64/vXi64. It then adds a bitcast back to vXf32/vXi32 after that. Intrinsic lowering was also adjusted to generate this new pattern. Fixes PR34357. We should probably just drop the intrinsic entirely and use native IR, but I'll leave that for a future patch. Any idea what instruction we should be lowering the floating-point 128-bit result version of this pattern to? There's a 128-bit v2i32 integer broadcast but not an fp one. Reviewers: aymanmus, zvi, igorb Reviewed By: aymanmus Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D37286 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312101 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 58761f4676e..e2862f6200c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19916,9 +19916,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget MVT ScalarVT = VT.getScalarType() == MVT::i32 ? 
MVT::i64 : MVT::f64; MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64); Src = DAG.getBitcast(BitcastVT, Src); + MVT ResVT = MVT::getVectorVT(ScalarVT, VT.getSizeInBits()/64); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, ResVT, Src); + Res = DAG.getBitcast(VT, Res); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(Res, Mask, PassThru, Subtarget, DAG); } default: break; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index e85ce630535..34bce9bf5a0 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1086,39 +1086,60 @@ multiclass avx512_broadcast_scalar opc, string OpcodeStr, DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; } -multiclass avx512_broadcast_rm opc, string OpcodeStr, - X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { +// Split version to allow mask and broadcast node to be different types. This +// helps support the 32x2 broadcasts. 
+multiclass avx512_broadcast_rm_split opc, string OpcodeStr, + X86VectorVTInfo MaskInfo, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { let ExeDomain = DestInfo.ExeDomain in { - defm r : AVX512_maskable, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>, T8PD, EVEX; - defm m : AVX512_maskable, + (MaskInfo.VT + (bitconvert + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))))>, T8PD, EVEX, EVEX_CD8; } - def : Pat<(DestInfo.VT (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src))))), - (!cast(NAME#DestInfo.ZSuffix#m) addr:$src)>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))), - DestInfo.RC:$src0)), + def : Pat<(MaskInfo.VT + (bitconvert + (DestInfo.VT (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src))))))), + (!cast(NAME#MaskInfo.ZSuffix#m) addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.RC:$src0)), (!cast(NAME#DestInfo.ZSuffix#mk) - DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))), - DestInfo.ImmAllZerosV)), - (!cast(NAME#DestInfo.ZSuffix#mkz) - DestInfo.KRCWM:$mask, addr:$src)>; -} + MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.ImmAllZerosV)), + (!cast(NAME#MaskInfo.ZSuffix#mkz) + MaskInfo.KRCWM:$mask, addr:$src)>; +} + +// Helper class to force mask and broadcast result to same type. 
+multiclass avx512_broadcast_rm opc, string OpcodeStr, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> : + avx512_broadcast_rm_split; multiclass avx512_fp_broadcast_sd opc, string OpcodeStr, AVX512VLVectorVTInfo _> { @@ -1442,11 +1463,13 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", multiclass avx512_common_broadcast_32x2 opc, string OpcodeStr, AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> { let Predicates = [HasDQI] in - defm Z : avx512_broadcast_rm, - EVEX_V512; + defm Z : avx512_broadcast_rm_split, + EVEX_V512; let Predicates = [HasDQI, HasVLX] in - defm Z256 : avx512_broadcast_rm, - EVEX_V256; + defm Z256 : avx512_broadcast_rm_split, + EVEX_V256; } multiclass avx512_common_broadcast_i32x2 opc, string OpcodeStr, @@ -1454,8 +1477,9 @@ multiclass avx512_common_broadcast_i32x2 opc, string OpcodeStr, avx512_common_broadcast_32x2 { let Predicates = [HasDQI, HasVLX] in - defm Z128 : avx512_broadcast_rm, - EVEX_V128; + defm Z128 : avx512_broadcast_rm_split, + EVEX_V128; } defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll index 33bed1861cf..6ca69160067 100644 --- a/test/CodeGen/X86/vector-shuffle-masked.ll +++ b/test/CodeGen/X86/vector-shuffle-masked.ll @@ -1680,3 +1680,158 @@ define <8 x i64> @test_broadcast_8i32_8i64(<8 x i32> *%p, i8 %mask) nounwind { %res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> zeroinitializer ret <8 x i64> %res } + +define <4 x float> @test_broadcastf32x2_v4f32(<4 x float> %vec, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v4f32: +; CHECK: # BB#0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = 
shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @test_broadcastf32x2_v4f32_z(<4 x float> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v4f32_z: +; CHECK: # BB#0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <4 x i32> @test_broadcasti32x2_v4i32(<4 x i32> %vec, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v4i32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @test_broadcasti32x2_v4i32_z(<4 x i32> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v4i32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x float> @test_broadcastf32x2_v8f32(<8 x float> %vec, <8 x float> %passthru, i8 %mask) { +; 
CHECK-LABEL: test_broadcastf32x2_v8f32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> %passthru + ret <8 x float> %res +} + +define <8 x float> @test_broadcastf32x2_v8f32_z(<8 x float> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v8f32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} + +define <8 x i32> @test_broadcasti32x2_v8i32(<8 x i32> %vec, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v8i32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x i32> @test_broadcasti32x2_v8i32_z(<8 x i32> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v8i32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +define <16 x float> 
@test_broadcastf32x2_v16f32_z(<16 x float> %vec, i16 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v16f32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} + +define <16 x i32> @test_broadcasti32x2_v16i32(<16 x i32> %vec, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v16i32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> %passthru + ret <16 x i32> %res +} + +define <16 x float> @test_broadcastf32x2_v16f32(<16 x float> %vec, <16 x float> %passthru, i16 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v16f32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> %passthru + ret <16 x float> %res +} + +define <16 x i32> @test_broadcasti32x2_v16i32_z(<16 x i32> %vec, i16 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v16i32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x 
i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> zeroinitializer + ret <16 x i32> %res +}