From: Justin Lebar <jlebar@google.com>
Date: Wed, 18 Jan 2017 00:08:27 +0000 (+0000)
Subject: [NVPTX] Improve lowering of llvm.ctpop.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=38c1089801be5e11ee2725706afbc5850d92cfb1;p=llvm

[NVPTX] Improve lowering of llvm.ctpop.

Summary:
Avoid an unnecessary conversion operation when using the result of
ctpop.i64 or ctpop.i16 as an i32, as in both cases the ptx instruction
we run returns an i32.

(Previously if we used the value as an i32, we'd do an unnecessary
zext+trunc.)

Reviewers: tra

Subscribers: jholewinski, llvm-commits

Differential Revision: https://reviews.llvm.org/D28721

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292302 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index e88a8c93e42..179830817ed 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2822,15 +2822,19 @@ let hasSideEffects = 0 in {
 // 32-bit has a direct PTX instruction
 def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
 
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
+// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
+// pattern that avoids the type conversion if we're truncating the result to
+// i32 anyway.
 def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
 
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
-// than 16 bits to store)
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16 bits.
+// If the result is then zero-extended to i32, we can avoid the final trunc.
 def : Pat<(ctpop Int16Regs:$a),
           (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
 
 // fpround f32 -> f16
 def : Pat<(f16 (fpround Float32Regs:$a)),
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 26035963c94..ef70444496e 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -36,8 +36,62 @@ define i64 @test_bitreverse64(i64 %a) {
   ret i64 %val
 }
 
+; CHECK-LABEL: test_popc32(
+define i32 @test_popc32(i32 %a) {
+; CHECK: popc.b32
+  %val = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %val
+}
+
+; CHECK-LABEL: test_popc64
+define i64 @test_popc64(i64 %a) {
+; CHECK: popc.b64
+; CHECK: cvt.u64.u32
+  %val = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %val
+}
+
+; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
+; if this function returns an i32, there's no need to do any type conversions
+; in the ptx.
+; CHECK-LABEL: test_popc64_trunc
+define i32 @test_popc64_trunc(i64 %a) {
+; CHECK: popc.b64
+; CHECK-NOT: cvt.
+  %val = call i64 @llvm.ctpop.i64(i64 %a)
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+
+; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and
+; then converting back to i16.
+; CHECK-LABEL: test_popc16
+define void @test_popc16(i16 %a, i16* %b) {
+; CHECK: cvt.u32.u16
+; CHECK: popc.b32
+; CHECK: cvt.u16.u32
+  %val = call i16 @llvm.ctpop.i16(i16 %a)
+  store i16 %val, i16* %b
+  ret void
+}
+
+; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need
+; to do any conversions after calling popc.b32, because that returns an i32.
+; CHECK-LABEL: test_popc16_to_32
+define i32 @test_popc16_to_32(i16 %a) {
+; CHECK: cvt.u32.u16
+; CHECK: popc.b32
+; CHECK-NOT: cvt.
+  %val = call i16 @llvm.ctpop.i16(i16 %a)
+  %zext = zext i16 %val to i32
+  ret i32 %zext
+}
+
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
 declare float @llvm.nvvm.sqrt.f(float)
 declare i32 @llvm.bitreverse.i32(i32)
 declare i64 @llvm.bitreverse.i64(i64)
+declare i16 @llvm.ctpop.i16(i16)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)