// 32-bit has a direct PTX instruction
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
+// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
+// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
+// result back to 16 bits if necessary. We also need to subtract 16 because
+// the high-order 16 zeros were counted.
+//
+// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+// use to save one SASS instruction (on sm_35 anyway):
+//
+// mov.b32 $tmp, {0xffff, $a}
+// ctlz.b32 $result, $tmp
+//
+// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+// and then ctlz that value. This way we don't have to subtract 16 from the
+// result. Unfortunately today we don't have a way to generate
+// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(ctlz Int16Regs:$a),
-          (SUBi16ri (CVT_u16_u32 (CLZr32
-            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
-            CvtNONE), 16)>;
+          (SUBi16ri (CVT_u16_u32
+            (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+// When the i16 result is immediately zero-extended to i32, count on the
+// extended input and subtract in 32 bits, skipping the trunc/zext round-trip.
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count
let hasSideEffects = 0 in {
declare i32 @llvm.ctlz.i32(i32, i1) readnone
declare i64 @llvm.ctlz.i64(i64, i1) readnone
+; NOTE(review): @llvm.ctlz.i16 is called by several functions below, but no
+; declare for it is visible in this hunk -- confirm it exists elsewhere in
+; the file.
+; There should be no difference between llvm.ctlz.i32(%a, true) and
+; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
+
+; CHECK-LABEL: myctpop(
+; NOTE(review): despite the "ctpop" names, these functions exercise the
+; llvm.ctlz.* intrinsics -- consider renaming them in a follow-up.
define i32 @myctpop(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
  ret i32 %val
}
-
-define i16 @myctpop16(i16 %a) {
-; CHECK: clz.b32
-  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
-  ret i16 %val
+; Same as myctpop, but with the is_zero_undef flag set; since PTX's clz(0)
+; is defined (it returns 0), the generated code should be identical.
+; CHECK-LABEL: myctpop_2(
+define i32 @myctpop_2(i32 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
+  ret i32 %val
}
+; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
+; value, so here we have to zero-extend it.
+; (The cvt.u64.u32 checked below is that zero-extension.)
+; CHECK-LABEL: myctpop64(
define i64 @myctpop64(i64 %a) {
-; CHECK: clz.b64
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  ret i64 %val
}
+; Same as myctpop64, but with the is_zero_undef flag set; the PTX should be
+; identical.
+; CHECK-LABEL: myctpop64_2(
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  ret i64 %val
+}
-
-define i32 @myctpop_2(i32 %a) {
-; CHECK: clz.b32
-  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
-  ret i32 %val
+; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
+; natural return width of PTX's clz.b64 instruction. No conversions should be
+; necessary in the PTX.
+; CHECK-LABEL: myctpop64_as_32(
+define i32 @myctpop64_as_32(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+; Truncation of ctlz.i64 to i32, with the is_zero_undef flag set.  The PTX
+; should be identical to myctpop64_as_32: a bare clz.b64, no conversions.
+; (Bug fix: this _2 variant previously passed i1 false, making it an exact
+; duplicate of myctpop64_as_32 instead of testing the true-flag path.)
+; CHECK-LABEL: myctpop64_as_32_2(
+define i32 @myctpop64_as_32_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
}
-define i16 @myctpop16_2(i16 %a) {
-; CHECK: clz.b32
+; ctlz.i16 is implemented by extending the input to i32, computing the result,
+; and then truncating the result back down to i16. But the NVPTX ABI
+; zero-extends i16 return values to i32, so the final truncation doesn't appear
+; in this function.
+; (The sub. checked below removes the 16 leading zeros contributed by the high
+; half of the zero-extended i32 input.)
+; CHECK-LABEL: myctpop_ret16(
+define i16 @myctpop_ret16(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  ret i16 %val
+}
+; Same as myctpop_ret16, but with the is_zero_undef flag set; the PTX should
+; be identical.
+; CHECK-LABEL: myctpop_ret16_2(
+define i16 @myctpop_ret16_2(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  ret i16 %val
}
-define i64 @myctpop64_2(i64 %a) {
-; CHECK: clz.b64
-  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
-  ret i64 %val
+; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
+; remain.
+; (cvt.u16.u32 and sub. may be scheduled in either order, hence CHECK-DAG.)
+; (Bug fix: "CHECK-NET" is not a FileCheck directive and was silently ignored,
+; leaving the clz.b32 line unchecked; corrected to CHECK-NEXT.)
+; CHECK-LABEL: myctpop_store16(
+define void @myctpop_store16(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  store i16 %val, i16* %b
+  ret void
+}
+; Same as myctpop_store16, but with the is_zero_undef flag set; the PTX should
+; be identical.
+; (Bug fixes: "CHECK-NET" corrected to the real FileCheck directive
+; CHECK-NEXT, and this _2 variant now passes i1 true -- previously it was an
+; exact duplicate of myctpop_store16.)
+; CHECK-LABEL: myctpop_store16_2(
+define void @myctpop_store16_2(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
+  store i16 %val, i16* %b
+  ret void
}