/// anything was changed.
virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }
+ /// Check whether the target can fold a load that feeds a subreg operand
+ /// (or a subreg operand that feeds a store).
+ /// For example, X86 may want to return true if it can fold
+ /// movl (%esp), %eax
+ /// subb %al, ...
+ /// Into:
+ /// subb (%esp), ...
+ ///
+ /// Ideally, we'd like the target implementation of foldMemoryOperand() to
+ /// reject subregs - but since this behavior used to be enforced in the
+ /// target-independent code, moving this responsibility to the targets
+ /// risks silently breaking out-of-tree targets.
+ virtual bool isSubregFoldable() const { return false; }
+
/// Attempt to fold a load or store of the specified stack
/// slot into the specified machine instruction for the specified operand(s).
/// If this is possible, a new instruction is returned with the specified
bool WasCopy = MI->isCopy();
unsigned ImpReg = 0;
- bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
- MI->getOpcode() == TargetOpcode::PATCHPOINT ||
- MI->getOpcode() == TargetOpcode::STACKMAP);
+ // Spill subregs if the target allows it.
+ // We always want to spill subregs for stackmap/patchpoint pseudos.
+ bool SpillSubRegs = TII.isSubregFoldable() ||
+ MI->getOpcode() == TargetOpcode::STATEPOINT ||
+ MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI->getOpcode() == TargetOpcode::STACKMAP;
// TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
// operands.
ImpReg = MO.getReg();
continue;
}
- // FIXME: Teach targets to deal with subregs.
+
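+    // Only fold subreg operands when the target has declared support via
+    // isSubregFoldable() (or for the stackmap-like pseudos handled above).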
if (!SpillSubRegs && MO.getSubReg())
return false;
// We cannot fold a load instruction into a def.
assert(MBB && "foldMemoryOperand needs an inserted instruction");
MachineFunction &MF = *MBB->getParent();
+ // If we're not folding a load into a subreg, the size of the load is the
+ // size of the spill slot. But if we are, we need to figure out what the
+ // actual load size is.
+ int64_t MemSize = 0;
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ if (Flags & MachineMemOperand::MOStore) {
+ MemSize = MFI.getObjectSize(FI);
+ } else {
+ for (unsigned Idx : Ops) {
+ int64_t OpSize = MFI.getObjectSize(FI);
+
+ if (auto SubReg = MI.getOperand(Idx).getSubReg()) {
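+        // Note: getSubRegIdxSize() reports the subreg width in bits; only
+        // narrow the memory size when that maps to a whole number of bytes.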
+ unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
+ if (SubRegSize > 0 && !(SubRegSize % 8))
+ OpSize = SubRegSize / 8;
+ }
+
+ MemSize = std::max(MemSize, OpSize);
+ }
+ }
+
+ assert(MemSize && "Did not expect a zero-sized stack slot");
+
MachineInstr *NewMI = nullptr;
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
assert((!(Flags & MachineMemOperand::MOLoad) ||
NewMI->mayLoad()) &&
"Folded a use to a non-load!");
- const MachineFrameInfo &MFI = MF.getFrameInfo();
assert(MFI.getObjectOffset(FI) != -1);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI),
+ MachinePointerInfo::getFixedStack(MF, FI), Flags, MemSize,
MFI.getObjectAlignment(FI));
NewMI->addMemOperand(MF, MMO);
const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
MachineBasicBlock::iterator Pos = MI;
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (Flags == MachineMemOperand::MOStore)
storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
+ // Don't fold subreg spills, or reloads that use a high subreg.
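+  // (The slot address corresponds to the low bytes of the spilled value, so
+  // a reload of a high subreg would read from the wrong offset.)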
+ for (auto Op : Ops) {
+ MachineOperand &MO = MI.getOperand(Op);
+ auto SubReg = MO.getSubReg();
+ if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
+ return nullptr;
+ }
+
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
+
+ // TODO: Support the case where LoadMI loads a wide register, but MI
+ // only uses a subreg.
+ for (auto Op : Ops) {
+ if (MI.getOperand(Op).getSubReg())
+ return nullptr;
+ }
+
// If loading from a FrameIndex, fold directly from the FrameIndex.
unsigned NumOps = LoadMI.getDesc().getNumOperands();
int FrameIndex;
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ /// Check whether the target can fold a load that feeds a subreg operand
+ /// (or a subreg operand that feeds a store).
+ bool isSubregFoldable() const override { return true; }
+
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
/// specified operand(s). If this is possible, the target should perform the
define fastcc i8 @fold32to8(i32 %add, i8 %spill) {
; CHECK-LABEL: fold32to8:
; CHECK: movl %ecx, (%esp) # 4-byte Spill
-; CHECK: movl (%esp), %eax # 4-byte Reload
-; CHECK: subb %al, %dl
+; CHECK: subb (%esp), %dl # 1-byte Folded Reload
entry:
tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
%trunc = trunc i32 %add to i8
define i32 @fold64to32(i64 %add, i32 %spill) {
; CHECK-LABEL: fold64to32:
; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; CHECK: subl %eax, %esi
+; CHECK: subl -{{[0-9]+}}(%rsp), %esi # 4-byte Folded Reload
entry:
tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
%trunc = trunc i64 %add to i32
define i8 @fold64to8(i64 %add, i8 %spill) {
; CHECK-LABEL: fold64to8:
; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; CHECK: subb %al, %sil
+; CHECK: subb -{{[0-9]+}}(%rsp), %sil # 1-byte Folded Reload
entry:
tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
%trunc = trunc i64 %add to i8
; AVX1-NEXT: orl %ebx, %r14d
; AVX1-NEXT: shlq $32, %r14
; AVX1-NEXT: orq %r15, %r14
-; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bx
; AVX1-NEXT: shll $16, %ebx
; AVX2-NEXT: orl %ebx, %r14d
; AVX2-NEXT: shlq $32, %r14
; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bx
; AVX2-NEXT: shll $16, %ebx
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r12d
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r12d
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload