From: Craig Topper
Date: Sat, 14 Jan 2017 07:29:24 +0000 (+0000)
Subject: [AVX-512] Replace V_SET0 in AVX-512 patterns with AVX512_128_SET0. Enhance AVX512_128...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=49a15c1e8e517efa7bdb33bab849b49a3dac290e;p=llvm

[AVX-512] Replace V_SET0 in AVX-512 patterns with AVX512_128_SET0. Enhance
AVX512_128_SET0 expansion to make this possible.

We'll now expand AVX512_128_SET0 to an EVEX VXORD if VLX is available. If it's
not, but register allocation has selected a non-extended register, we will use
a VEX VXORPS. And if it's an extended register without VLX, we'll use a 512-bit
XOR.

Do the same for AVX512_FsFLD0SS/SD.

This makes it possible for the register allocator to have all 32 registers
available to work with.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292004 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index a473073ead2..5ef943c2c76 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -460,7 +460,7 @@ def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
 }
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
+    isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
 def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
                [(set VR128X:$dst, (v4i32 immAllZerosV))]>;
 def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
@@ -470,7 +470,7 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
 // This is expanded by ExpandPostRAPseudos.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+    isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
 def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
                         [(set FR32X:$dst, fp32imm0)]>;
 def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@@ -3439,31 +3439,31 @@ let Predicates = [HasAVX512] in {
   // Move scalar to XMM zero-extended, zeroing a VR128X then do a
   // MOVS{S,D} to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
-            (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+            (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>;
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
-            (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+            (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
-            (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+            (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
-            (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+            (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>;
   }
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
             (SUBREG_TO_REG (i32 0),
-              (VMOVSSZrr (v4f32 (V_SET0)),
+              (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
             (SUBREG_TO_REG (i32 0),
-              (VMOVSSZrr (v4i32 (V_SET0)),
+              (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
-              (VMOVSSZrr (v4f32 (V_SET0)),
+              (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
-              (VMOVSSZrr (v4i32 (V_SET0)),
+              (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
   let AddedComplexity = 20 in {
@@ -3525,11 +3525,11 @@ let Predicates = [HasAVX512] in {
   }
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+            (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
                        FR32X:$src)), sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+            (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
                        FR64X:$src)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                    (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
@@ -3538,18 +3538,18 @@ let Predicates = [HasAVX512] in {
   // Move low f64 and clear high bits.
   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
             (SUBREG_TO_REG (i32 0),
-              (VMOVSDZrr (v2f64 (V_SET0)),
+              (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
-              (VMOVSDZrr (v2f64 (V_SET0)),
+              (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
              (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
   // Extract and store.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 8c5dbd51866..d30a0683cf6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -6831,14 +6831,33 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     assert(HasAVX && "AVX not supported");
     return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
   case X86::AVX512_128_SET0:
-    return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
-  case X86::AVX512_256_SET0:
-    return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
+  case X86::AVX512_FsFLD0SS:
+  case X86::AVX512_FsFLD0SD: {
+    bool HasVLX = Subtarget.hasVLX();
+    unsigned SrcReg = MIB->getOperand(0).getReg();
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+      return Expand2AddrUndef(MIB,
+                              get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+    // Extended register without VLX. Use a larger XOR.
+    SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
+    MIB->getOperand(0).setReg(SrcReg);
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+  }
+  case X86::AVX512_256_SET0: {
+    bool HasVLX = Subtarget.hasVLX();
+    unsigned SrcReg = MIB->getOperand(0).getReg();
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+      return Expand2AddrUndef(MIB,
+                              get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr));
+    // Extended register without VLX. Use a larger XOR.
+    SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+    MIB->getOperand(0).setReg(SrcReg);
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+  }
   case X86::AVX512_512_SET0:
     return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
-  case X86::AVX512_FsFLD0SS:
-  case X86::AVX512_FsFLD0SD:
-    return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
   case X86::V_SETALLONES:
     return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1812d01711d..e301d0ab7ac 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -446,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1, SchedRW = [WriteZero] in {
   def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
-                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
+                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
   def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
-                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
+                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
 }
 //===----------------------------------------------------------------------===//
@@ -461,12 +461,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-zeros value if folding it would be beneficial.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
+    isPseudo = 1, SchedRW = [WriteZero] in {
 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                [(set VR128:$dst, (v4f32 immAllZerosV))]>;
 }
-let Predicates = [NoVLX] in
+let Predicates = [NoAVX512] in
 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
@@ -475,7 +475,7 @@ def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
 // at the rename stage without using any execution unit, so SET0PSY
 // and SET0PDY can be used for vector int instructions without penalty
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
+    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                  [(set VR256:$dst, (v8i32 immAllZerosV))]>;
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index b318288e951..1198514be5f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1229,7 +1229,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ;
 ; AVX512VL-LABEL: insert_reg_and_zero_v4f64:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512VL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
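
Note: as a quick illustration of the expansion rule described in the commit
message, the standalone C++ sketch below models how the 128-bit zeroing pseudo
now chooses its replacement instruction. The enum and function names are
hypothetical stand-ins for illustration only, not LLVM APIs; the real logic is
in X86InstrInfo::expandPostRAPseudo above.

#include <cstdio>

// Hypothetical tags standing in for the real X86 opcode enums.
enum class ZeroOpc { VPXORDZ128rr, VXORPSrr, VPXORDZrr };

// Mirrors the selection made when expanding AVX512_128_SET0 (and the scalar
// FLD0 pseudos): EVEX VPXORD when VLX is available, VEX VXORPS when the
// destination is one of the first 16 XMM registers, otherwise a 512-bit
// VPXORD on the containing ZMM register (zeroing the full register is fine).
static ZeroOpc pick128BitZero(bool hasVLX, unsigned regEncoding) {
  if (hasVLX)
    return ZeroOpc::VPXORDZ128rr;  // legal for all 32 registers
  if (regEncoding < 16)
    return ZeroOpc::VXORPSrr;      // VEX prefix can only encode xmm0-xmm15
  return ZeroOpc::VPXORDZrr;       // zero the whole zmm super-register
}

int main() {
  // xmm3 without VLX -> VXORPSrr; xmm20 without VLX -> VPXORDZrr.
  std::printf("%d %d %d\n", (int)pick128BitZero(true, 20),
              (int)pick128BitZero(false, 3), (int)pick128BitZero(false, 20));
  return 0;
}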