From: Craig Topper
Date: Fri, 13 Jul 2018 06:25:31 +0000 (+0000)
Subject: [X86] Prefer MOVSS/SD over BLEND under optsize in isel.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1b59f048c7054b6461dfac9a40e0eb7d6b2efb3c;p=llvm

[X86] Prefer MOVSS/SD over BLEND under optsize in isel.

Previously we iseled to a blend, commuted to another blend, and then
commuted back to movss/movsd or a blend depending on optsize. Now we do
it directly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336976 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index ec4a50a0b7b..5e30e00a1bd 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -944,6 +944,8 @@ let RecomputePerFunction = 1 in {
   def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
   def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
                             "MF->getFunction().optForSize()">;
+  def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || "
+                                        "!Subtarget->hasSSE41()">;
 }
 
 def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index da8c2f8ad8f..10c0a7febe9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -191,8 +191,9 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
 
 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                       X86MemOperand x86memop, string OpcodeStr,
-                      Domain d, string Name> {
+                      Domain d, string Name, Predicate pred> {
   // AVX
+  let Predicates = [UseAVX, OptForSize] in
   defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                               "V"#Name>,
@@ -204,6 +205,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
+  let Predicates = [pred, NoSSE41_Or_OptForSize] in
   defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                             "\t{$src2, $dst|$dst, $src2}", d, Name>;
   }
@@ -235,9 +237,9 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
 
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
-                        SSEPackedSingle, "MOVSS">, XS;
+                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
-                        SSEPackedDouble, "MOVSD">, XD;
+                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
   defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                              SSEPackedSingle>, XS;
 }
 
+let Predicates = [UseAVX, OptForSize] in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+}
+
 let Predicates = [UseSSE1] in {
-  let Predicates = [NoSSE41] in {
+  let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -6380,17 +6391,27 @@ let Predicates = [HasAVX2] in {
             (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
-// Patterns
-// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
-// on targets where they have equal performance. These were changed to use
-// blends because blends have better throughput on SandyBridge and Haswell, but
-// movs[s/d] are 1-2 byte shorter instructions.
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on SandyBridge
+// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
 let Predicates = [UseAVX] in {
+  let Predicates = [UseAVX, OptForSpeed] in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
             (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
+
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
+  }
+
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
             (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
@@ -6408,16 +6429,25 @@ let Predicates = [UseAVX] in {
             (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
 }
 
-// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
-// on targets where they have equal performance. These were changed to use
-// blends because blends have better throughput on SandyBridge and Haswell, but
-// movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41] in {
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on SandyBridge
+// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseSSE41, OptForSpeed] in {
   // With SSE41 we can use blends for these patterns.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
             (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (PBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
+
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (PBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
 }
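
A note on the predicate mechanics, for readers following the TableGen:
predicate results are normally cached per subtarget, but optsize is a
per-function attribute, which is why NoSSE41_Or_OptForSize is defined
inside the "let RecomputePerFunction = 1" block alongside OptForSize
and OptForSpeed. A minimal sketch of the mechanism follows; the name
MyOptForSize is invented for illustration and is not part of this
patch:

  // Hypothetical sketch only; it mirrors the structure the patch uses.
  // Predicates defined under RecomputePerFunction = 1 are re-evaluated
  // for each function, so they may depend on function attributes such
  // as optsize rather than only on fixed subtarget features.
  let RecomputePerFunction = 1 in {
    def MyOptForSize : Predicate<"MF->getFunction().optForSize()">;
  }

  // A selection pattern gated on that predicate: the movss form is
  // considered only inside functions marked optsize/minsize.
  let Predicates = [UseSSE1, MyOptForSize] in
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, VR128:$src2)>;

The patch applies exactly this split: the MOVSS/MOVSD register-register
patterns are gated on OptForSize (or on targets without SSE4.1), and
the blend patterns on OptForSpeed, so only one set is live for any
given function.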
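
The "1-2 byte shorter" claim in the comment can be sanity-checked from
the legacy SSE encodings; the following summary is from memory of the
Intel SDM and worth verifying there:

  movss   xmm, xmm          F3 0F 10 /r         4 bytes
  movsd   xmm, xmm          F2 0F 10 /r         4 bytes
  blendps xmm, xmm, imm8    66 0F 3A 0C /r ib   6 bytes
  blendpd xmm, xmm, imm8    66 0F 3A 0D /r ib   6 bytes
  pblendw xmm, xmm, imm8    66 0F 3A 0E /r ib   6 bytes

For the register-register forms changed here, the movs[s/d] choice
saves two bytes per instruction: the blends sit in the three-byte
0F 3A opcode space and carry an immediate. The VEX comparison is
similar, since vmovss/vmovsd fit the two-byte VEX prefix while
vblendps/vblendpd/vpblendw need the three-byte form plus an immediate.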