From 265327e7cf0d070bcc30155adaa97871e3eaa656 Mon Sep 17 00:00:00 2001 From: Amaury Sechet Date: Tue, 8 Oct 2019 16:16:26 +0000 Subject: [PATCH] (Re)generate various tests. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374074 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AArch64/arm64-rev.ll | 230 +++++++++++- test/CodeGen/AMDGPU/lshr.v2i16.ll | 550 +++++++++++++++++++++++++--- test/CodeGen/AMDGPU/shl.v2i16.ll | 575 ++++++++++++++++++++++++++---- test/CodeGen/ARM/rev.ll | 57 ++- test/CodeGen/Thumb/rev.ll | 25 +- 5 files changed, 1270 insertions(+), 167 deletions(-) diff --git a/test/CodeGen/AArch64/arm64-rev.ll b/test/CodeGen/AArch64/arm64-rev.ll index a04fe05137b..8ceb60432e5 100644 --- a/test/CodeGen/AArch64/arm64-rev.ll +++ b/test/CodeGen/AArch64/arm64-rev.ll @@ -8,10 +8,11 @@ define i32 @test_rev_w(i32 %a) nounwind { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: rev w0, w0 ; CHECK-NEXT: ret -; GISEL-LABEL: test_rev_w: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: rev w0, w0 -; GISEL-NEXT: ret +; +; FALLBACK-LABEL: test_rev_w: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: rev w0, w0 +; FALLBACK-NEXT: ret entry: %0 = tail call i32 @llvm.bswap.i32(i32 %a) ret i32 %0 @@ -23,10 +24,11 @@ define i64 @test_rev_x(i64 %a) nounwind { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: rev x0, x0 ; CHECK-NEXT: ret -; GISEL-LABEL: test_rev_x: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: rev x0, x0 -; GISEL-NEXT: ret +; +; FALLBACK-LABEL: test_rev_x: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: rev x0, x0 +; FALLBACK-NEXT: ret entry: %0 = tail call i64 @llvm.bswap.i64(i64 %a) ret i64 %0 @@ -40,6 +42,13 @@ define i32 @test_rev_w_srl16(i16 %a) { ; CHECK-NEXT: and w8, w0, #0xffff ; CHECK-NEXT: rev16 w0, w8 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev_w_srl16: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: and w8, w0, #0xffff +; FALLBACK-NEXT: rev w8, w8 +; FALLBACK-NEXT: lsr w0, w8, #16 +; FALLBACK-NEXT: ret entry: %0 = zext i16 %a to i32 %1 = tail call i32 @llvm.bswap.i32(i32 %0) @@ -53,6 +62,13 @@ define i32 @test_rev_w_srl16_load(i16 *%a) { ; CHECK-NEXT: ldrh w8, [x0] ; CHECK-NEXT: rev16 w0, w8 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev_w_srl16_load: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: ldrh w8, [x0] +; FALLBACK-NEXT: rev w8, w8 +; FALLBACK-NEXT: lsr w0, w8, #16 +; FALLBACK-NEXT: ret entry: %0 = load i16, i16 *%a %1 = zext i16 %0 to i32 @@ -68,6 +84,14 @@ define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) { ; CHECK-NEXT: add w8, w8, w1, uxtb ; CHECK-NEXT: rev16 w0, w8 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev_w_srl16_add: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: and w8, w1, #0xff +; FALLBACK-NEXT: add w8, w8, w0, uxtb +; FALLBACK-NEXT: rev w8, w8 +; FALLBACK-NEXT: lsr w0, w8, #16 +; FALLBACK-NEXT: ret entry: %0 = zext i8 %a to i32 %1 = zext i8 %b to i32 @@ -85,6 +109,14 @@ define i64 @test_rev_x_srl32(i32 %a) { ; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: rev32 x0, x8 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev_x_srl32: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: // kill: def $w0 killed $w0 def $x0 +; FALLBACK-NEXT: ubfx x8, x0, #0, #32 +; FALLBACK-NEXT: rev x8, x8 +; FALLBACK-NEXT: lsr x0, x8, #32 +; FALLBACK-NEXT: ret entry: %0 = zext i32 %a to i64 %1 = tail call i64 @llvm.bswap.i64(i64 %0) @@ -98,6 +130,13 @@ define i64 @test_rev_x_srl32_load(i32 *%a) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: rev32 x0, x8 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev_x_srl32_load: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: ldr w8, [x0] +; FALLBACK-NEXT: rev x8, x8 +; FALLBACK-NEXT: lsr x0, x8, #32 +; FALLBACK-NEXT: ret entry: %0 = load i32, i32 *%a %1 = zext i32 %0 to i64 @@ -112,6 +151,14 @@ define i64 @test_rev_x_srl32_shift(i64 %a) { ; CHECK-NEXT: ubfx x8, x0, #2, #29 ; CHECK-NEXT: rev32 x0, x8 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev_x_srl32_shift: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: lsl x8, x0, #33 +; FALLBACK-NEXT: lsr x8, x8, #35 +; FALLBACK-NEXT: rev x8, x8 +; FALLBACK-NEXT: lsr x0, x8, #32 +; FALLBACK-NEXT: ret entry: %0 = shl i64 %a, 33 %1 = lshr i64 %0, 35 @@ -128,6 +175,19 @@ define i32 @test_rev16_w(i32 %X) nounwind { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: rev16 w0, w0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev16_w: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: lsr w8, w0, #8 +; FALLBACK-NEXT: lsl w9, w0, #8 +; FALLBACK-NEXT: and w10, w8, #0xff0000 +; FALLBACK-NEXT: and w11, w9, #0xff000000 +; FALLBACK-NEXT: and w9, w9, #0xff00 +; FALLBACK-NEXT: orr w10, w11, w10 +; FALLBACK-NEXT: and w8, w8, #0xff +; FALLBACK-NEXT: orr w9, w10, w9 +; FALLBACK-NEXT: orr w0, w9, w8 +; FALLBACK-NEXT: ret entry: %tmp1 = lshr i32 %X, 8 %X15 = bitcast i32 %X to i32 @@ -151,6 +211,13 @@ define i64 @test_rev16_x(i64 %a) nounwind { ; CHECK-NEXT: rev x8, x0 ; CHECK-NEXT: ror x0, x8, #16 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev16_x: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: rev x8, x0 +; FALLBACK-NEXT: lsl x9, x8, #48 +; FALLBACK-NEXT: orr x0, x9, x8, lsr #16 +; FALLBACK-NEXT: ret entry: %0 = tail call i64 @llvm.bswap.i64(i64 %a) %1 = lshr i64 %0, 16 @@ -164,6 +231,13 @@ define i64 @test_rev32_x(i64 %a) nounwind { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: rev32 x0, x0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_rev32_x: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: rev x8, x0 +; FALLBACK-NEXT: lsl x9, x8, #32 +; FALLBACK-NEXT: orr x0, x9, x8, lsr #32 +; FALLBACK-NEXT: ret entry: %0 = tail call i64 @llvm.bswap.i64(i64 %a) %1 = lshr i64 %0, 32 @@ -178,6 +252,12 @@ define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev64.8b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64D8: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: rev64.8b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> ret <8 x i8> %tmp2 @@ -189,6 +269,12 @@ define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev64.4h v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64D16: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: rev64.4h v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> ret <4 x i16> %tmp2 @@ -200,6 +286,17 @@ define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev64.2s v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64D32: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: adrp x8, .LCPI13_0 +; FALLBACK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0] +; FALLBACK-NEXT: mov.s v2[1], w8 +; FALLBACK-NEXT: mov.d v0[1], v2[0] +; FALLBACK-NEXT: tbl.16b v0, { v0 }, v1 +; FALLBACK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; FALLBACK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> ret <2 x i32> %tmp2 @@ -211,6 +308,17 @@ define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev64.2s v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64Df: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: adrp x8, .LCPI14_0 +; FALLBACK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] +; FALLBACK-NEXT: mov.s v2[1], w8 +; FALLBACK-NEXT: mov.d v0[1], v2[0] +; FALLBACK-NEXT: tbl.16b v0, { v0 }, v1 +; FALLBACK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; FALLBACK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> ret <2 x float> %tmp2 @@ -222,6 +330,12 @@ define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev64.16b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64Q8: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: rev64.16b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -233,6 +347,12 @@ define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev64.8h v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64Q16: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: rev64.8h v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> ret <8 x i16> %tmp2 @@ -244,6 +364,14 @@ define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev64.4s v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64Q32: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: adrp x8, .LCPI17_0 +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0] +; FALLBACK-NEXT: tbl.16b v0, { v0, v1 }, v2 +; FALLBACK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> ret <4 x i32> %tmp2 @@ -255,6 +383,14 @@ define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev64.4s v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64Qf: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: adrp x8, .LCPI18_0 +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; FALLBACK-NEXT: tbl.16b v0, { v0, v1 }, v2 +; FALLBACK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> ret <4 x float> %tmp2 @@ -266,6 +402,12 @@ define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev32.8b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev32D8: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: rev32.8b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> ret <8 x i8> %tmp2 @@ -277,6 +419,12 @@ define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev32.4h v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev32D16: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: rev32.4h v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> ret <4 x i16> %tmp2 @@ -288,6 +436,12 @@ define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev32.16b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev32Q8: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: rev32.16b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -299,6 +453,12 @@ define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev32.8h v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev32Q16: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: rev32.8h v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> ret <8 x i16> %tmp2 @@ -310,6 +470,12 @@ define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev16.8b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev16D8: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: rev16.8b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> ret <8 x i8> %tmp2 @@ -321,6 +487,12 @@ define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev16.16b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev16Q8: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: rev16.16b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -334,6 +506,12 @@ define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: rev64.8b v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64D8_undef: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr d0, [x0] +; FALLBACK-NEXT: rev64.8b v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> ret <8 x i8> %tmp2 @@ -345,6 +523,12 @@ define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: rev32.8h v0, v0 ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev32Q16_undef: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: rev32.8h v0, v0 +; FALLBACK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> ret <8 x i16> %tmp2 @@ -359,6 +543,14 @@ define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst ; CHECK-NEXT: st1.h { v0 }[5], [x8] ; CHECK-NEXT: st1.h { v0 }[6], [x1] ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: test_vrev64: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: ldr q0, [x0] +; FALLBACK-NEXT: add x8, x1, #2 // =2 +; FALLBACK-NEXT: st1.h { v0 }[5], [x8] +; FALLBACK-NEXT: st1.h { v0 }[6], [x1] +; FALLBACK-NEXT: ret entry: %0 = bitcast <4 x i16>* %source to <8 x i16>* %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4 @@ -381,6 +573,19 @@ define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest ; CHECK-NEXT: rev64.4s v0, v0 ; CHECK-NEXT: str q0, [x1, #176] ; CHECK-NEXT: ret +; +; FALLBACK-LABEL: float_vrev64: +; FALLBACK: // %bb.0: // %entry +; FALLBACK-NEXT: fmov s0, wzr +; FALLBACK-NEXT: mov.s v0[1], v0[0] +; FALLBACK-NEXT: mov.s v0[2], v0[0] +; FALLBACK-NEXT: adrp x8, .LCPI28_0 +; FALLBACK-NEXT: mov.s v0[3], v0[0] +; FALLBACK-NEXT: ldr q1, [x0] +; FALLBACK-NEXT: ldr q2, [x8, :lo12:.LCPI28_0] +; FALLBACK-NEXT: tbl.16b v0, { v0, v1 }, v2 +; FALLBACK-NEXT: str q0, [x1, #176] +; FALLBACK-NEXT: ret entry: %0 = bitcast float* %source to <4 x float>* %tmp2 = load <4 x float>, <4 x float>* %0, align 4 @@ -396,10 +601,11 @@ define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: rev32.16b v0, v0 ; CHECK-NEXT: ret -; GISEL-LABEL: test_vrev32_bswap: -; GISEL: // %bb.0: -; GISEL-NEXT: rev32.16b v0, v0 -; GISEL-NEXT: ret +; +; FALLBACK-LABEL: test_vrev32_bswap: +; FALLBACK: // %bb.0: +; FALLBACK-NEXT: rev32.16b v0, v0 +; FALLBACK-NEXT: ret %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source) ret <4 x i32> %bswap } diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll index e88550fb92c..5275a819080 100644 --- a/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,46 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s -; GCN-LABEL: {{^}}s_lshr_v2i16: -; GFX9: s_load_dword [[LHS:s[0-9]+]] -; GFX9: s_load_dword [[RHS:s[0-9]+]] -; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] -; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] - -; CIVI: s_load_dword [[LHS:s[0-9]+]] -; CIVI: s_load_dword [[RHS:s[0-9]+]] -; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 -; CIVI-DAG: s_lshl_b32 -; CIVI: v_or_b32_e32 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +; GFX9-LABEL: s_lshr_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v2 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: s_lshr_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s1, s5, s4 +; VI-NEXT: s_and_b32 s4, s0, s4 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: s_lshr_b32 s0, s5, s0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_bfe_u32 v0, s1, v0, 16 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: v_or_b32_e32 v2, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: s_lshr_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dword s0, s[0:1], 0xc +; CI-NEXT: s_mov_b32 s3, 0xffff +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: s_lshr_b32 s8, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, s3 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 s0, s1, s8 +; CI-NEXT: s_and_b32 s2, s2, s3 +; CI-NEXT: v_bfe_u32 v0, s2, v0, 16 +; CI-NEXT: s_lshl_b32 s0, s0, 16 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}v_lshr_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] -; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] - -; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_lshrrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]] -; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[RHS]] -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: v_lshr_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_lshr_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 +; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_lshr_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_bfe_u32 v2, v2, v3, 16 +; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -53,11 +141,71 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ret void } -; GCN-LABEL: {{^}}lshr_v_s_v2i16: -; GFX9: s_load_dword [[RHS:s[0-9]+]] -; GFX9: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +; GFX9-LABEL: lshr_v_s_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: lshr_v_s_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b16_e32 v2, s0, v3 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: lshr_v_s_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s9, s8, 16 +; CI-NEXT: s_mov_b32 s10, 0xffff +; CI-NEXT: s_and_b32 s8, s8, s10 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, s10, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3 +; CI-NEXT: v_bfe_u32 v2, v2, s8, 16 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -68,11 +216,71 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16 ret void } -; GCN-LABEL: {{^}}lshr_s_v_v2i16: -; GFX9: s_load_dword [[LHS:s[0-9]+]] -; GFX9: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] -; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +; GFX9-LABEL: lshr_s_v_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: lshr_s_v_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s0 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: lshr_s_v_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s9, s8, 16 +; CI-NEXT: s_mov_b32 s10, 0xffff +; CI-NEXT: s_and_b32 s8, s8, s10 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, s10, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s9, v3 +; CI-NEXT: v_bfe_u32 v2, s8, v2, 16 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -83,10 +291,64 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16 ret void } -; GCN-LABEL: {{^}}lshr_imm_v_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] -; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: lshr_imm_v_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: lshr_imm_v_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b16_e64 v2, v4, 8 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: lshr_imm_v_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshr_b32_e32 v3, 8, v3 +; CI-NEXT: v_bfe_u32 v2, 8, v2, 16 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -97,10 +359,59 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ret void } -; GCN-LABEL: {{^}}lshr_v_imm_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: lshr_v_imm_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: lshr_v_imm_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: lshr_v_imm_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -111,13 +422,84 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ret void } -; GCN-LABEL: {{^}}v_lshr_v4i16: -; GCN: {{buffer|flat|global}}_load_dwordx2 -; GCN: {{buffer|flat|global}}_load_dwordx2 -; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: {{buffer|flat|global}}_store_dwordx2 define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: v_lshr_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_lshr_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 +; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_lshr_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v4, s8, v4 +; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v5, s8, v5 +; CI-NEXT: v_bfe_u32 v3, v3, v5, 16 +; CI-NEXT: v_lshrrev_b32_e32 v5, v9, v7 +; CI-NEXT: v_bfe_u32 v2, v2, v4, 16 +; CI-NEXT: v_lshrrev_b32_e32 v4, v8, v6 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v3, v3, v5 +; CI-NEXT: v_or_b32_e32 v2, v2, v4 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -130,12 +512,66 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ret void } -; GCN-LABEL: {{^}}lshr_v_imm_v4i16: -; GCN: {{buffer|flat|global}}_load_dwordx2 -; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GCN: {{buffer|flat|global}}_store_dwordx2 define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: lshr_v_imm_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: lshr_v_imm_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; CI-LABEL: lshr_v_imm_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b32 s8, 0xff00ff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll index b116e2ee6a2..c6816d351f2 100644 --- a/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -1,60 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s -; GCN-LABEL: {{^}}s_shl_v2i16: -; GFX9: s_load_dword [[LHS:s[0-9]+]] -; GFX9: s_load_dword [[RHS:s[0-9]+]] -; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] -; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] - -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_lshr_b32 -; VI: s_lshr_b32 -; VI: s_and_b32 -; VI: s_and_b32 -; VI: s_lshl_b32 -; VI: s_lshl_b32 -; VI: s_lshl_b32 -; VI: s_and_b32 -; VI: s_or_b32 - -; CI: s_load_dword s -; CI: s_load_dword s -; CI: s_lshr_b32 -; CI: s_and_b32 -; CI: s_lshr_b32 -; CI: s_lshl_b32 -; CI: s_lshl_b32 -; CI: s_lshl_b32 -; CI: s_and_b32 -; CI: s_or_b32 -; CI: _store_dword define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +; GFX9-LABEL: s_shl_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: s_shl_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_mov_b32 s3, 0xffff +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshr_b32 s8, s0, 16 +; VI-NEXT: s_and_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s0, s0, s3 +; VI-NEXT: s_lshl_b32 s0, s2, s0 +; VI-NEXT: s_lshl_b32 s1, s1, s8 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_and_b32 s0, s0, s3 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; CI-LABEL: s_shl_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dword s0, s[0:1], 0xc +; CI-NEXT: s_mov_b32 s3, 0xffff +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: s_and_b32 s8, s0, s3 +; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: s_lshl_b32 s0, s1, s0 +; CI-NEXT: s_lshl_b32 s1, s2, s8 +; CI-NEXT: s_lshl_b32 s0, s0, 16 +; CI-NEXT: s_and_b32 s1, s1, s3 +; CI-NEXT: s_or_b32 s0, s1, s0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}v_shl_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] -; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] - -; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]] -; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: v_shl_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_shl_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 +; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_shl_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_and_b32_e32 v5, s8, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; CI-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -67,11 +142,71 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a ret void } -; GCN-LABEL: {{^}}shl_v_s_v2i16: -; GFX9: s_load_dword [[RHS:s[0-9]+]] -; GFX9: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +; GFX9-LABEL: shl_v_s_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: shl_v_s_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: shl_v_s_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b32 s9, 0xffff +; CI-NEXT: s_lshr_b32 s10, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s9 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 +; CI-NEXT: v_and_b32_e32 v2, s9, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -82,11 +217,71 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ret void } -; GCN-LABEL: {{^}}shl_s_v_v2i16: -; GFX9: s_load_dword [[LHS:s[0-9]+]] -; GFX9: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] -; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +; GFX9-LABEL: shl_s_v_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: shl_s_v_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: shl_s_v_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_and_b32_e32 v3, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -97,10 +292,66 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ret void } -; GCN-LABEL: {{^}}shl_imm_v_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] -; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: shl_imm_v_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: shl_imm_v_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: shl_imm_v_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b32 s4, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_and_b32_e32 v3, s4, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshl_b32_e32 v2, 8, v2 +; CI-NEXT: v_lshl_b32_e32 v3, 8, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -111,10 +362,60 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1 ret void } -; GCN-LABEL: {{^}}shl_v_imm_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: shl_v_imm_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: shl_v_imm_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: shl_v_imm_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -125,13 +426,84 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1 ret void } -; GCN-LABEL: {{^}}v_shl_v4i16: -; GCN: {{buffer|flat|global}}_load_dwordx2 -; GCN: {{buffer|flat|global}}_load_dwordx2 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: {{buffer|flat|global}}_store_dwordx2 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: v_shl_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_shl_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 +; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_shl_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_and_b32_e32 v8, s8, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_and_b32_e32 v9, s8, v5 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v5, v5, v7 +; CI-NEXT: v_lshlrev_b32_e32 v3, v9, v3 +; CI-NEXT: v_lshlrev_b32_e32 v4, v4, v6 +; CI-NEXT: v_lshlrev_b32_e32 v2, v8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_or_b32_e32 v3, v3, v5 +; CI-NEXT: v_or_b32_e32 v2, v2, v4 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -144,12 +516,73 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a ret void } -; GCN-LABEL: {{^}}shl_v_imm_v4i16: -; GCN: {{buffer|flat|global}}_load_dwordx2 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GCN: {{buffer|flat|global}}_store_dwordx2 define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +; GFX9-LABEL: shl_v_imm_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: shl_v_imm_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_mov_b32 s4, 0xff000000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_and_b32_e32 v4, s4, v4 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; CI-LABEL: shl_v_imm_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b32 s8, 0xff00 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; CI-NEXT: v_and_b32_e32 v4, s8, v4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/test/CodeGen/ARM/rev.ll b/test/CodeGen/ARM/rev.ll index a36526ff1fb..b97dbc844e0 100644 --- a/test/CodeGen/ARM/rev.ll +++ b/test/CodeGen/ARM/rev.ll @@ -1,8 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s define i32 @test1(i32 %X) nounwind { -; CHECK-LABEL: test1 -; CHECK: rev16 r0, r0 +; CHECK-LABEL: test1: +; CHECK: @ %bb.0: +; CHECK-NEXT: rev16 r0, r0 +; CHECK-NEXT: bx lr %tmp1 = lshr i32 %X, 8 %X15 = bitcast i32 %X to i32 %tmp4 = shl i32 %X15, 8 @@ -17,8 +20,10 @@ define i32 @test1(i32 %X) nounwind { } define i32 @test2(i32 %X) nounwind { -; CHECK-LABEL: test2 -; CHECK: revsh r0, r0 +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr %tmp1 = lshr i32 %X, 8 %tmp1.upgrd.1 = trunc i32 %tmp1 to i16 %tmp3 = trunc i32 %X to i16 @@ -31,9 +36,11 @@ define i32 @test2(i32 %X) nounwind { ; rdar://9147637 define i32 @test3(i16 zeroext %a) nounwind { -entry: ; CHECK-LABEL: test3: -; CHECK: revsh r0, r0 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr +entry: %0 = tail call i16 @llvm.bswap.i16(i16 %a) %1 = sext i16 %0 to i32 ret i32 %1 @@ -42,9 +49,11 @@ entry: declare i16 @llvm.bswap.i16(i16) nounwind readnone define i32 @test4(i16 zeroext %a) nounwind { -entry: ; CHECK-LABEL: test4: -; CHECK: revsh r0, r0 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr +entry: %conv = zext i16 %a to i32 %shr9 = lshr i16 %a, 8 %conv2 = zext i16 %shr9 to i32 @@ -57,9 +66,11 @@ entry: ; rdar://9609059 define i32 @test5(i32 %i) nounwind readnone { +; CHECK-LABEL: test5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr entry: -; CHECK-LABEL: test5 -; CHECK: revsh r0, r0 %shl = shl i32 %i, 24 %shr = ashr exact i32 %shl, 16 %shr23 = lshr i32 %i, 8 @@ -70,9 +81,11 @@ entry: ; rdar://9609108 define i32 @test6(i32 %x) nounwind readnone { +; CHECK-LABEL: test6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: rev16 r0, r0 +; CHECK-NEXT: bx lr entry: -; CHECK-LABEL: test6 -; CHECK: rev16 r0, r0 %and = shl i32 %x, 8 %shl = and i32 %and, 65280 %and2 = lshr i32 %x, 8 @@ -87,10 +100,12 @@ entry: ; rdar://9164521 define i32 @test7(i32 %a) nounwind readnone { +; CHECK-LABEL: test7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: rev r0, r0 +; CHECK-NEXT: lsr r0, r0, #16 +; CHECK-NEXT: bx lr entry: -; CHECK-LABEL: test7 -; CHECK: rev r0, r0 -; CHECK: lsr r0, r0, #16 %and = lshr i32 %a, 8 %shr3 = and i32 %and, 255 %and2 = shl i32 %a, 8 @@ -100,9 +115,11 @@ entry: } define i32 @test8(i32 %a) nounwind readnone { +; CHECK-LABEL: test8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr entry: -; CHECK-LABEL: test8 -; CHECK: revsh r0, r0 %and = lshr i32 %a, 8 %shr4 = and i32 %and, 255 %and2 = shl i32 %a, 8 @@ -114,9 +131,11 @@ entry: ; rdar://10750814 define zeroext i16 @test9(i16 zeroext %v) nounwind readnone { +; CHECK-LABEL: test9: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: rev16 r0, r0 +; CHECK-NEXT: bx lr entry: -; CHECK-LABEL: test9 -; CHECK: rev16 r0, r0 %conv = zext i16 %v to i32 %shr4 = lshr i32 %conv, 8 %shl = shl nuw nsw i32 %conv, 8 diff --git a/test/CodeGen/Thumb/rev.ll b/test/CodeGen/Thumb/rev.ll index 3e947022e60..9ac65bf5c8e 100644 --- a/test/CodeGen/Thumb/rev.ll +++ b/test/CodeGen/Thumb/rev.ll @@ -1,8 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumb-eabi -mattr=+v6 %s -o - | FileCheck %s define i32 @test1(i32 %X) nounwind { -; CHECK: test1 -; CHECK: rev16 r0, r0 +; CHECK-LABEL: test1: +; CHECK: @ %bb.0: +; CHECK-NEXT: rev16 r0, r0 +; CHECK-NEXT: bx lr %tmp1 = lshr i32 %X, 8 %X15 = bitcast i32 %X to i32 %tmp4 = shl i32 %X15, 8 @@ -17,8 +20,10 @@ define i32 @test1(i32 %X) nounwind { } define i32 @test2(i32 %X) nounwind { -; CHECK: test2 -; CHECK: revsh r0, r0 +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr %tmp1 = lshr i32 %X, 8 %tmp1.upgrd.1 = trunc i32 %tmp1 to i16 %tmp3 = trunc i32 %X to i16 @@ -31,9 +36,11 @@ define i32 @test2(i32 %X) nounwind { ; rdar://9147637 define i32 @test3(i16 zeroext %a) nounwind { -entry: ; CHECK-LABEL: test3: -; CHECK: revsh r0, r0 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr +entry: %0 = tail call i16 @llvm.bswap.i16(i16 %a) %1 = sext i16 %0 to i32 ret i32 %1 @@ -42,9 +49,11 @@ entry: declare i16 @llvm.bswap.i16(i16) nounwind readnone define i32 @test4(i16 zeroext %a) nounwind { -entry: ; CHECK-LABEL: test4: -; CHECK: revsh r0, r0 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: revsh r0, r0 +; CHECK-NEXT: bx lr +entry: %conv = zext i16 %a to i32 %shr9 = lshr i16 %a, 8 %conv2 = zext i16 %shr9 to i32 -- 2.40.0