#define DEBUG_TYPE "bpf-lower"
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+ cl::Hidden, cl::init(false),
+ cl::desc("Expand memcpy into load/store pairs in order"));
+
static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
setMinFunctionAlignment(3);
setPrefFunctionAlignment(3);
- // inline memcpy() for kernel to see explicit copy
- MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
- MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
- MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+ if (BPFExpandMemcpyInOrder) {
+ // The LLVM generic code will expand memcpy into load/store pairs at this
+ // stage, before quite a few later optimization passes run, so the loads and
+ // stores could be moved apart from each other. That causes trouble for the
+ // memcpy pattern matchers inside kernel eBPF JIT compilers.
+ //
+ // When -bpf-expand-memcpy-in-order is specified, we want to defer the
+ // expansion of memcpy to a later stage in the code generation pipeline so
+ // those load/store pairs won't be touched and are kept in order. Hence, we
+ // set MaxStoresPerMem* to zero to disable the generic
+ // getMemcpyLoadsAndStores code path, and ask LLVM to use the target
+ // expander EmitTargetCodeForMemcpy instead.
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+ } else {
+ // inline memcpy() for kernel to see explicit copy
+ unsigned CommonMaxStores =
+ STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+ }
// CPU/Feature control
HasAlu32 = STI.getHasAlu32();
return "BPFISD::BR_CC";
case BPFISD::Wrapper:
return "BPFISD::Wrapper";
+ case BPFISD::MEMCPY:
+ return "BPFISD::MEMCPY";
}
return nullptr;
}
return PromotedReg2;
}
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+ MachineBasicBlock *BB)
+ const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB(*MF, MI);
+ unsigned ScratchReg;
+
+ // This function performs custom insertion while lowering BPFISD::MEMCPY,
+ // which, following memcpy semantics, only has two register operands: the
+ // copy destination address and the copy source address.
+ //
+ // Because we will expand BPFISD::MEMCPY into load/store pairs, we need a
+ // third scratch register to serve as the destination register of the loads
+ // and the source register of the stores.
+ //
+ // The scratch register is added with the Define | Dead | EarlyClobber
+ // flags. The EarlyClobber flag has the semantic property that the operand
+ // it is attached to is clobbered before the rest of the inputs are read;
+ // hence it must be unique among the operands to the instruction. The Define
+ // flag is needed to convince the machine verifier that an undef value isn't
+ // a problem, as we are loading memory into it anyway. The Dead flag is
+ // needed because the value in the scratch register isn't supposed to be
+ // used by any other instruction.
+ ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+ MIB.addReg(ScratchReg,
+ RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+ return BB;
+}
+
MachineBasicBlock *
BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
Opc == BPF::Select_32 ||
Opc == BPF::Select_32_64);
+ bool isMemcpyOp = Opc == BPF::MEMCPY;
+
#ifndef NDEBUG
bool isSelectRIOp = (Opc == BPF::Select_Ri ||
Opc == BPF::Select_Ri_64_32 ||
Opc == BPF::Select_Ri_32_64);
- assert((isSelectRROp || isSelectRIOp) && "Unexpected instr type to insert");
+ assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+ "Unexpected instr type to insert");
#endif
+ if (isMemcpyOp)
+ return EmitInstrWithCustomInserterMemcpy(MI, BB);
+
bool is32BitCmp = (Opc == BPF::Select_32 ||
Opc == BPF::Select_32_64 ||
Opc == BPF::Select_Ri_32 ||
CALL,
SELECT_CC,
BR_CC,
- Wrapper
+ Wrapper,
+ MEMCPY
};
}
unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
bool isSigned) const;
+
+ MachineBasicBlock *EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+ MachineBasicBlock *BB)
+ const;
+
};
}
llvm_unreachable("Impossible reg-to-reg copy");
}
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ uint64_t CopyLen = MI->getOperand(2).getImm();
+ uint64_t Alignment = MI->getOperand(3).getImm();
+ unsigned ScratchReg = MI->getOperand(4).getReg();
+ MachineBasicBlock *BB = MI->getParent();
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned LdOpc, StOpc;
+
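+ // Select the load/store opcode whose access width matches the copy
+ // alignment.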
+ switch (Alignment) {
+ case 1:
+ LdOpc = BPF::LDB;
+ StOpc = BPF::STB;
+ break;
+ case 2:
+ LdOpc = BPF::LDH;
+ StOpc = BPF::STH;
+ break;
+ case 4:
+ LdOpc = BPF::LDW;
+ StOpc = BPF::STW;
+ break;
+ case 8:
+ LdOpc = BPF::LDD;
+ StOpc = BPF::STD;
+ break;
+ default:
+ llvm_unreachable("unsupported memcpy alignment");
+ }
+
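+ // Copy the aligned bulk of the region: one load/store pair per
+ // Alignment-sized chunk, emitted strictly in order.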
+ unsigned IterationNum = CopyLen >> Log2_64(Alignment);
+ for (unsigned I = 0; I < IterationNum; ++I) {
+ BuildMI(*BB, MI, dl, get(LdOpc))
+ .addReg(ScratchReg).addReg(SrcReg).addImm(I * Alignment);
+ BuildMI(*BB, MI, dl, get(StOpc))
+ .addReg(ScratchReg).addReg(DstReg).addImm(I * Alignment);
+ }
+
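+ // Copy the remaining tail bytes (CopyLen % Alignment) with progressively
+ // narrower accesses.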
+ unsigned BytesLeft = CopyLen & (Alignment - 1);
+ unsigned Offset = IterationNum * Alignment;
+ bool Hanging4Byte = BytesLeft & 0x4;
+ bool Hanging2Byte = BytesLeft & 0x2;
+ bool Hanging1Byte = BytesLeft & 0x1;
+ if (Hanging4Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDW))
+ .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STW))
+ .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+ Offset += 4;
+ }
+ if (Hanging2Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDH))
+ .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STH))
+ .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+ Offset += 2;
+ }
+ if (Hanging1Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDB))
+ .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STB))
+ .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+ }
+
+ BB->erase(MI);
+}
+
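+// Expand the MEMCPY pseudo after register allocation, once the scratch
+// register appended by the custom inserter has been assigned, so the emitted
+// load/store pairs are kept in order.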
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == BPF::MEMCPY) {
+ expandMEMCPY(MI);
+ return true;
+ }
+
+ return false;
+}
+
void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned SrcReg, bool IsKill, int FI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned SrcReg,
bool isKill, int FrameIndex,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+private:
+ void expandMEMCPY(MachineBasicBlock::iterator) const;
+
};
}
SDTCisVT<3, OtherVT>]>;
def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>,
+ SDTCisVT<2, i64>,
+ SDTCisVT<3, i64>]>;
def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
def BPFIsBigEndian : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
def : Pat<(i64 (extloadi32 ADDRri:$src)),
(SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
}
+
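+// The MEMCPY pseudo keeps the whole copy as a single instruction until after
+// register allocation. The custom inserter appends an extra scratch register
+// operand (hence variable_ops); expandPostRAPseudo then rewrites the pseudo
+// into in-order load/store pairs.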
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+ def MEMCPY : Pseudo<
+ (outs),
+ (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+ "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+ [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
--- /dev/null
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // Requires the copy size to be a constant.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+
+ unsigned CopyLen = ConstantSize->getZExtValue();
+ unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+ // Impose the same copy length limit as MaxStoresPerMemcpy.
+ if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+ return SDValue();
+
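+ // Lower into the target-specific MEMCPY node. The constant length and
+ // alignment are attached as immediate operands so that the later pseudo
+ // expansion can emit the load/store pairs in order.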
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+ DAG.getConstant(CopyLen, dl, MVT::i64),
+ DAG.getConstant(Align, dl, MVT::i64));
+
+ return Dst.getValue(0);
+}
--- /dev/null
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
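+ // Common cap on the number of load/store pairs used by both the generic
+ // expansion (MaxStoresPerMem*) and the in-order EmitTargetCodeForMemcpy
+ // path.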
+ unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+} // end namespace llvm
+
+#endif
#include "BPFFrameLowering.h"
#include "BPFISelLowering.h"
#include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
BPFInstrInfo InstrInfo;
BPFFrameLowering FrameLowering;
BPFTargetLowering TLInfo;
- SelectionDAGTargetInfo TSInfo;
+ BPFSelectionDAGInfo TSInfo;
private:
void initializeEnvironment();
const BPFTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const TargetRegisterInfo *getRegisterInfo() const override {
BPFISelLowering.cpp
BPFMCInstLower.cpp
BPFRegisterInfo.cpp
+ BPFSelectionDAGInfo.cpp
BPFSubtarget.cpp
BPFTargetMachine.cpp
BPFMIPeephole.cpp
--- /dev/null
+; RUN: llc < %s -march=bpfel -bpf-expand-memcpy-in-order | FileCheck %s
+; RUN: llc < %s -march=bpfeb -bpf-expand-memcpy-in-order | FileCheck %s
+;
+; #define COPY_LEN 9
+;
+; void cal_align1(void *a, void *b)
+; {
+; __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; void cal_align2(short *a, short *b)
+; {
+; __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 19
+; void cal_align4(int *a, int *b)
+; {
+; __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 27
+; void cal_align8(long long *a, long long *b)
+; {
+; __builtin_memcpy(a, b, COPY_LEN);
+; }
+
+; Function Attrs: nounwind
+define dso_local void @cal_align1(i8* nocapture %a, i8* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 9, i1 false)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
+
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u8 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u8 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 1)
+; CHECK: *(u8 *)([[DST_REG]] + 1) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 2)
+; CHECK: *(u8 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 3)
+; CHECK: *(u8 *)([[DST_REG]] + 3) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 4)
+; CHECK: *(u8 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 5)
+; CHECK: *(u8 *)([[DST_REG]] + 5) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 6)
+; CHECK: *(u8 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 7)
+; CHECK: *(u8 *)([[DST_REG]] + 7) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align2(i16* nocapture %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i16* %a to i8*
+ %1 = bitcast i16* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 %0, i8* align 2 %1, i64 9, i1 false)
+ ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u16 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u16 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 2)
+; CHECK: *(u16 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 4)
+; CHECK: *(u16 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 6)
+; CHECK: *(u16 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align4(i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %a to i8*
+ %1 = bitcast i32* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 19, i1 false)
+ ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u32 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u32 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 4)
+; CHECK: *(u32 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 8)
+; CHECK: *(u32 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 12)
+; CHECK: *(u32 *)([[DST_REG]] + 12) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 16)
+; CHECK: *(u16 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 18)
+; CHECK: *(u8 *)([[DST_REG]] + 18) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align8(i64* nocapture %a, i64* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i64* %a to i8*
+ %1 = bitcast i64* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 27, i1 false)
+ ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u64 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u64 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 8)
+; CHECK: *(u64 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 16)
+; CHECK: *(u64 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 24)
+; CHECK: *(u16 *)([[DST_REG]] + 24) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 26)
+; CHECK: *(u8 *)([[DST_REG]] + 26) = [[SCRATCH_REG]]