From: Tim Corringham Date: Fri, 1 Feb 2019 16:51:09 +0000 (+0000) Subject: [AMDGPU] Fix for vector element insertion X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3e0069dcb626a75e3d671443b31ccf15de0b6528;p=llvm [AMDGPU] Fix for vector element insertion Summary: Incorrect code was generated when lowering insertelement operations for vectors with 8 or 16 bit elements. The value being inserted was not adjusted for the position of the element within the 32 bit word and so only the low element within each 32 bit word could receive the intended value. Fixed by simply replicating the value to each element of a congruent vector before the mask-and-OR operation used to update the intended element. A number of affected LIT tests have been updated appropriately. Reviewers: arsenm, nhaehnle Reviewed By: arsenm Subscribers: llvm-commits, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Tags: #llvm Differential Revision: https://reviews.llvm.org/D57588 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352885 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index b57b2d2fd20..34643c99e11 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4369,12 +4369,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, MVT IntVT = MVT::getIntegerVT(VecSize); // Avoid stack access for dynamic indexing. - SDValue Val = InsVal; - if (InsVal.getValueType() == MVT::f16) - Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); - // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); + + // Create a congruent vector with the target value in each element so that + // the required element can be masked and ORed into the target vector. 
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index e2741c25382..cee091af643 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -814,8 +814,8 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { } ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16: -; GFX9: v_pk_mul_f16 ; GFX9: v_mul_f16_e32 +; GFX9: v_pk_mul_f16 ; GFX9-NOT: v_max ; GFX9-NOT: v_pk_max define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { diff --git a/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 80309b40e17..b311b6aa29d 100644 --- a/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -112,7 +112,10 @@ entry: ; GCN-NOT: buffer_ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00 +; GCN: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00 +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]] +; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}} +; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}} define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) { entry: %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel @@ -168,9 +171,10 @@ entry: ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 ; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]] -; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}} +; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], 
v{{[0-9]+}} define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) { entry: %v = insertelement <2 x i16> %vec, i16 1, i32 %sel @@ -184,7 +188,10 @@ entry: ; GCN-NOT: buffer_ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +; GCN: s_mov_b32 [[K:s[0-9]+]], 0x10001 +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]] +; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}} +; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}} define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) { entry: %v = insertelement <4 x i16> %vec, i16 1, i32 %sel @@ -197,7 +204,11 @@ entry: ; GCN-NOT: buffer_ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101 +; GCN: s_and_b32 s3, s1, [[K]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) { entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 93ee16ea85d..47e080a94ba 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -242,7 +242,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* % ; VI-NOT: _load ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1 -; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] +; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 0x505, [[MASK]] ; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]] ; VI: v_and_b32_e32 
[[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]] ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]] @@ -261,15 +261,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load +; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] -; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16 +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]] +; VI: v_lshrrev_b32_e32 [[V_HI2:v[0-9]+]], 16, [[BFI]] -; VI-DAG: buffer_store_short [[BFI]] -; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]] +; VI: buffer_store_short [[BFI]] ; VI: buffer_store_byte [[V_HI2]] define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b @@ -282,10 +281,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load +; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]] ; VI: buffer_store_dword [[BFI]] define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b @@ -303,9 +303,11 @@ define amdgpu_kernel void 
@dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] +; VI: s_mov_b32 [[VAL:s[0-9]+]], 0x5050505 +; VI: s_and_b32 s[[INS_HI:[0-9]+]], s[[MASK_SHIFT_HI]], [[VAL]] +; VI: s_and_b32 s[[INS_LO:[0-9]+]], s[[MASK_SHIFT_LO]], [[VAL]] ; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}} -; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5 -; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]] +; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS_LO]]:[[INS_HI]]{{\]}}, [[AND]] ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]] ; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]] ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}} diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index b4fb59983cb..a8a298045ff 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -446,7 +446,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x12341234 ; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] @@ -611,25 +611,20 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GCN-DAG: 
s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff ; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0 -; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}} - -; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}} -; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9+]]], v[[SHIFT_LO]] -; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9+]]], v[[SHIFT_HI]] -; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]] - -; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]] -; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]] -; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD - - -; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] -; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]], -; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0, +; CIVI-DAG: s_and_b32 [[MASKED_VAL:s[0-9]+]], [[VAL]], s[[MASK_LO]] +; VI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[MASKED_VAL]], 16 +; CI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[VAL]], 16 +; CIVI: s_or_b32 [[DUP_VAL:s[0-9]+]], [[MASKED_VAL]], [[SHIFTED_VAL]] +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX9-DAG: s_pack_ll_b32_b16 [[DUP_VAL:s[0-9]+]], [[VAL]], [[VAL]] +; GFX89: v_lshlrev_b64 v[{{[0-9:]+}}], [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}} +; CI: v_lshl_b64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SCALED_IDX]] +; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}} +; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 
%val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll index e35cf7a349a..34e1d201c9c 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll index 07ee65526c9..95e38c36e62 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -6,7 +6,7 @@ ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]