From 077ab85e5a250bec9715e6ffc272a54feec48e97 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 17 Sep 2016 15:44:16 +0000 Subject: [PATCH] AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@281822 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 27 ++++++++ test/CodeGen/AMDGPU/bitcast-vector-extract.ll | 69 +++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 test/CodeGen/AMDGPU/bitcast-vector-extract.ll diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 22871e64a3e..e214164facb 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2548,6 +2548,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::BITCAST: { EVT DestVT = N->getValueType(0); + + // Push casts through vector builds. This helps avoid emitting a large + // number of copies when materializing floating point vector constants. 
+ // + // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => + // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) + if (DestVT.isVector()) { + SDValue Src = N->getOperand(0); + if (Src.getOpcode() == ISD::BUILD_VECTOR) { + EVT SrcVT = Src.getValueType(); + unsigned NElts = DestVT.getVectorNumElements(); + + if (SrcVT.getVectorNumElements() == NElts) { + EVT DestEltVT = DestVT.getVectorElementType(); + + SmallVector<SDValue, 8> CastedElts; + SDLoc SL(N); + for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { + SDValue Elt = Src.getOperand(I); + CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); + } + + return DAG.getBuildVector(DestVT, SL, CastedElts); + } + } + } + if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) break; diff --git a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll new file mode 100644 index 00000000000..2482fa761b1 --- /dev/null +++ b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; The bitcast should be pushed through the bitcasts so the vectors can +; be broken down and the shared components can be CSEd + +; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32: +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { + %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float> + store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out + + %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float> + store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32: +; GCN: buffer_store_dwordx4 +; GCN: 
buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { + %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float> + store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out + + %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float> + store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64: +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { + %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double> + store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out + + %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double> + store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16: +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +; GCN-NOT: v_mov_b32 +; GCN: buffer_store_dwordx4 +define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { + %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float> + store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out + + %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float> + store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out + ret void +} -- 2.50.1