From: Craig Topper
Date: Sat, 11 Jun 2016 13:43:21 +0000 (+0000)
Subject: [AVX512] Lower v8i64 and v16i32 to pshufd when possible.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=64162b500842558c9babd9f2cb035db5890e68ef;p=llvm

[AVX512] Lower v8i64 and v16i32 to pshufd when possible.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272473 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 95b74b18a92..5f4bff75ee1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11780,6 +11780,25 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Shuf128;
 
+  // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
+  // use lower latency instructions that will operate on both 128-bit lanes.
+  SmallVector<int, 2> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, RepeatedMask)) {
+    if (isSingleInputShuffleMask(Mask)) {
+      int PSHUFDMask[] = {-1, -1, -1, -1};
+      for (int i = 0; i < 2; ++i)
+        if (RepeatedMask[i] >= 0) {
+          PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
+          PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
+        }
+      return DAG.getBitcast(
+          MVT::v8i64,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+                      DAG.getBitcast(MVT::v16i32, V1),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+    }
+  }
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Shift;
@@ -11803,15 +11822,27 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the four 128-bit
+  // lanes.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+    if (isSingleInputShuffleMask(Mask))
+      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+      return V;
+  }
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Shift;
 
-  if (SDValue Unpck =
-          lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
-    return Unpck;
-
   // Try to use byte rotation instructions.
   if (Subtarget.hasBWI())
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 98ba509bfde..91a4f16e22d 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
@@ -76,9 +76,9 @@ define i32 @sad_16i8() nounwind {
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vmovd %xmm0, %eax
 ; AVX512F-NEXT: retq
@@ -101,9 +101,9 @@ define i32 @sad_16i8() nounwind {
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: retq
@@ -323,9 +323,9 @@ define i32 @sad_32i8() nounwind {
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vmovd %xmm0, %eax
 ; AVX512F-NEXT: retq
@@ -350,9 +350,9 @@ define i32 @sad_32i8() nounwind {
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: retq
@@ -805,9 +805,9 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vmovd %xmm0, %eax
 ; AVX512F-NEXT: retq
@@ -833,9 +833,9 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 12a913c74f8..c3d3f7eaf4d 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
 
@@ -225,3 +225,21 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
   %v2 = shufflevector <16 x float> %v_a, <16 x float> undef, <8 x i32>
   ret <8 x float> %v2
 }
+
+define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; ALL-NEXT: retq
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  ret <16 x i32> %c
+}
+
+define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; ALL-NEXT: retq
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
+  ret <16 x i32> %c
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 067c3f1f3ea..a4455e76d5b 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -107,3 +107,4 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19
   %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32>
   ret <32 x i16> %c
 }
+
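Note on the index arithmetic behind the new v8i64 path, for readers who want to see it in
isolation: PSHUFD only shuffles 32-bit elements, so the change above widens the per-128-bit-lane
v2i64 repeated mask into a v4i32 mask (element i becomes the pair 2*i, 2*i+1) and then encodes
that mask as PSHUFD's 8-bit immediate. The following is a minimal standalone C++ sketch of that
arithmetic, not LLVM code; the helper names widenI64MaskToI32 and packImm8 are invented for the
example, and the immediate packing just follows pshufd's two-bits-per-destination-element
encoding, with an undef index (-1) defaulting to the identity position.

// Standalone illustration of the v8i64 -> PSHUFD mask widening (not LLVM code).
#include <array>
#include <cstdint>
#include <cstdio>

// Each 64-bit element covers two 32-bit elements, so index i in the per-lane
// v2i64 mask becomes the pair {2*i, 2*i + 1} in the v4i32 mask.
static std::array<int, 4> widenI64MaskToI32(const std::array<int, 2> &M64) {
  std::array<int, 4> M32 = {-1, -1, -1, -1};
  for (int i = 0; i < 2; ++i)
    if (M64[i] >= 0) {
      M32[2 * i] = 2 * M64[i];
      M32[2 * i + 1] = 2 * M64[i] + 1;
    }
  return M32;
}

// Pack a 4-element i32 mask into pshufd's 8-bit immediate: two bits per
// destination element, with an undef element (-1) kept at its own position.
static uint8_t packImm8(const std::array<int, 4> &M32) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<uint8_t>((M32[i] < 0 ? i : M32[i]) << (2 * i));
  return Imm;
}

int main() {
  // A v8i64 shuffle that repeats {1,1} in every 128-bit lane (duplicate the
  // high qword), as in the reduction tail of the sad.ll tests.
  std::array<int, 2> Repeated = {1, 1};
  std::array<int, 4> M32 = widenI64MaskToI32(Repeated); // {2,3,2,3}
  std::printf("imm8 = 0x%02x\n", static_cast<unsigned>(packImm8(M32)));
  return 0;
}

Running the sketch prints imm8 = 0xee, i.e. a single vpshufd with the per-lane pattern
[2,3,2,3], which is exactly the first vpshufd that the updated sad.ll checks now expect in
place of the earlier vpunpckhqdq/vpsrldq sequences; the other vpshufd lines in the checks come
from the analogous v16i32 repeated-mask path rather than from this widening.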