AMDGPU: Push bitcasts through build_vector

author Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 17 Sep 2016 15:44:16 +0000 (15:44 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 17 Sep 2016 15:44:16 +0000 (15:44 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 17 Sep 2016 15:44:16 +0000 (15:44 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 17 Sep 2016 15:44:16 +0000 (15:44 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 22871e64a3e80b28b01502d3653fa54b36605e35..e214164facbd551139c71f021a39ddef485a5006 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2548,6 +2548,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
      break;
    case ISD::BITCAST: {
      EVT DestVT = N->getValueType(0);
+
+    // Push casts through vector builds. This helps avoid emitting a large
+    // number of copies when materializing floating point vector constants.
+    //
+    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+    //   vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+    if (DestVT.isVector()) {
+      SDValue Src = N->getOperand(0);
+      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+        EVT SrcVT = Src.getValueType();
+        unsigned NElts = DestVT.getVectorNumElements();
+
+        if (SrcVT.getVectorNumElements() == NElts) {
+          EVT DestEltVT = DestVT.getVectorElementType();
+
+          SmallVector<SDValue, 8> CastedElts;
+          SDLoc SL(N);
+          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
+            SDValue Elt = Src.getOperand(I);
+            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
+          }
+
+          return DAG.getBuildVector(DestVT, SL, CastedElts);
+        }
+      }
+    }
+
      if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
        break;
  
diff --git a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll

new file mode 100644 (file)

index 0000000..2482fa7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The bitcast should be pushed through the build_vector so the vectors can
+; be broken down and the shared components can be CSE'd
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+  %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
+  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+  %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
+  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
+  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
+  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
+  store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
+
+  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
+  store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+  %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
+  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+  %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
+  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  ret void
+}
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 17 Sep 2016 15:44:16 +0000 (15:44 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 17 Sep 2016 15:44:16 +0000 (15:44 +0000)
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/bitcast-vector-extract.ll	[new file with mode: 0644]	patch \| blob