+// Reads a PTX special register as an i32. Takes no arguments, is marked
+// IntrNoMem, and maps to the builtin __nvvm_read_ptx_sreg_<name>.
class PTXReadSRegIntrinsic_r32<string name>
: Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>;
-
+// Reads a PTX special register as an i64. Takes no arguments, is marked
+// IntrNoMem, and maps to the builtin __nvvm_read_ptx_sreg_<name>.
class PTXReadSRegIntrinsic_r64<string name>
: Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+// Intrinsics to read registers with non-constant values, i.e. values that
+// change over the kernel lifetime. Such reads should not be CSE'd.
+// i32 reader. Uses IntrInaccessibleMemOnly where PTXReadSRegIntrinsic_r32 uses
+// IntrNoMem; the builtin name is still __nvvm_read_ptx_sreg_<name>.
+class PTXReadNCSRegIntrinsic_r32<string name>
+ : Intrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+// i64 reader. Uses IntrInaccessibleMemOnly where PTXReadSRegIntrinsic_r64 uses
+// IntrNoMem; the builtin name is still __nvvm_read_ptx_sreg_<name>.
+class PTXReadNCSRegIntrinsic_r64<string name>
+ : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+
defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
def int_nvvm_read_ptx_sreg_lanemask_gt :
PTXReadSRegIntrinsic_r32<"lanemask_gt">;
-def int_nvvm_read_ptx_sreg_clock : PTXReadSRegIntrinsic_r32<"clock">;
-def int_nvvm_read_ptx_sreg_clock64 : PTXReadSRegIntrinsic_r64<"clock64">;
+// clock and clock64 may return a different value on every read, so they use
+// the PTXReadNCSRegIntrinsic_* classes instead of the IntrNoMem ones.
+def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic_r32<"clock">;
+def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic_r64<"clock64">;
-def int_nvvm_read_ptx_sreg_pm0 : PTXReadSRegIntrinsic_r32<"pm0">;
-def int_nvvm_read_ptx_sreg_pm1 : PTXReadSRegIntrinsic_r32<"pm1">;
-def int_nvvm_read_ptx_sreg_pm2 : PTXReadSRegIntrinsic_r32<"pm2">;
-def int_nvvm_read_ptx_sreg_pm3 : PTXReadSRegIntrinsic_r32<"pm3">;
+// pm0-pm3 are likewise switched to the non-constant reader class.
+// NOTE(review): presumably the PTX performance-monitor counters - confirm
+// against the PTX ISA special-register list.
+def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic_r32<"pm0">;
+def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
+def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
+def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
ret i32 %zext
}
+; Most nvvm.read.ptx.sreg.* intrinsics always return the same value and may
+; be CSE'd.
+; Both calls below read tid.x; only one mov from %tid.x is expected in the
+; output, and a second one appearing before the return is rejected.
+; CHECK-LABEL: test_tid
+define i32 @test_tid() {
+; CHECK: mov.u32 %r{{.*}}, %tid.x;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NOT: mov.u32 %r{{.*}}, %tid.x;
+ %b = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %ret = add i32 %a, %b
+; CHECK: ret
+ ret i32 %ret
+}
+
+; Reading clock() or clock64() should not be CSE'd, as each read may return
+; a different value.
+; Two calls are made below and two separate movs from %clock are expected.
+; CHECK-LABEL: test_clock
+define i32 @test_clock() {
+; CHECK: mov.u32 %r{{.*}}, %clock;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
+; CHECK: mov.u32 %r{{.*}}, %clock;
+ %b = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
+ %ret = add i32 %a, %b
+; CHECK: ret
+ ret i32 %ret
+}
+
+; 64-bit counterpart of test_clock: two calls to the clock64 intrinsic are
+; expected to produce two separate movs from %clock64.
+; CHECK-LABEL: test_clock64
+define i64 @test_clock64() {
+; CHECK: mov.u64 %r{{.*}}, %clock64;
+ %a = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
+; CHECK: mov.u64 %r{{.*}}, %clock64;
+ %b = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
+ %ret = add i64 %a, %b
+; CHECK: ret
+ ret i64 %ret
+}
+
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.nvvm.sqrt.f(float)
declare i16 @llvm.ctpop.i16(i16)
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
+
+; Special-register read intrinsics called by test_tid, test_clock and
+; test_clock64.
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.clock()
+declare i64 @llvm.nvvm.read.ptx.sreg.clock64()