From 611bf0527cc31af098da573f03f29d26ec2c2536 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 1 Oct 2019 22:40:03 +0000
Subject: [PATCH] Revert r373172 "[X86] Add custom isel logic to match VPTERNLOG from 2 logic ops."

This seems to be causing some performance regressions that I'm trying to
investigate.

One thing that stands out is that this transform can increase the live range
of the operands of the earlier logic op. This can be bad for register
allocation. If there are two logic op inputs we should really combine the one
that is closest, but SelectionDAG doesn't have a good way to do that. Maybe we
need to do this as a basic block transform in Machine IR.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373401 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp           |  80 +---------
 test/CodeGen/X86/avx512-cvt.ll               |  17 +-
 test/CodeGen/X86/avx512-gfni-intrinsics.ll   | 144 ++++++++---------
 test/CodeGen/X86/machine-combiner-int-vec.ll | 117 +++++---------
 test/CodeGen/X86/midpoint-int-vec-128.ll     |  12 +-
 test/CodeGen/X86/midpoint-int-vec-256.ll     |  24 +--
 test/CodeGen/X86/midpoint-int-vec-512.ll     |  84 +++++-----
 test/CodeGen/X86/sadd_sat_vec.ll             |  57 ++-----
 test/CodeGen/X86/ssub_sat_vec.ll             |  57 ++-----
 test/CodeGen/X86/vec_int_to_fp.ll            |  16 +-
 test/CodeGen/X86/vector-fshl-256.ll          |   9 +-
 test/CodeGen/X86/vector-fshl-512.ll          |  56 ++++---
 test/CodeGen/X86/vector-fshl-rot-256.ll      |  20 +--
 test/CodeGen/X86/vector-fshl-rot-512.ll      |  42 ++---
 test/CodeGen/X86/vector-fshr-256.ll          |  29 ++--
 test/CodeGen/X86/vector-fshr-512.ll          | 154 ++++++++++---------
 test/CodeGen/X86/vector-fshr-rot-256.ll      |  20 +--
 test/CodeGen/X86/vector-fshr-rot-512.ll      |  82 +++++-----
 test/CodeGen/X86/vector-idiv-sdiv-512.ll     |   6 +-
 test/CodeGen/X86/vector-rotate-128.ll        |  77 ++--------
 test/CodeGen/X86/vector-rotate-256.ll        |  69 +++------
 test/CodeGen/X86/vector-rotate-512.ll        |  92 ++++++-----
 test/CodeGen/X86/vector-shift-ashr-128.ll    |   3 +-
 test/CodeGen/X86/vector-shift-ashr-256.ll    |  18 ++-
 test/CodeGen/X86/vector-shift-ashr-512.ll    |  18 ++-
 test/CodeGen/X86/vector-shift-ashr-sub128.ll |   9 +-
 26 files changed, 564 insertions(+), 748 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 76d585855b8..5b546d42d98 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -514,7 +514,6 @@ namespace {
     bool tryShiftAmountMod(SDNode *N);
     bool combineIncDecVector(SDNode *Node);
     bool tryShrinkShlLogicImm(SDNode *N);
-    bool tryVPTERNLOG(SDNode *N);
     bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
     bool tryMatchBitSelect(SDNode *N);
 
@@ -3833,82 +3832,6 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
   return true;
 }
 
-// Try to match two logic ops to a VPTERNLOG.
-// FIXME: Handle inverted inputs?
-// FIXME: Handle more complex patterns that use an operand more than once?
-// FIXME: Support X86ISD::ANDNP
-bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
-  MVT NVT = N->getSimpleValueType(0);
-
-  // Make sure we support VPTERNLOG.
-  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
-      NVT.getVectorElementType() == MVT::i1)
-    return false;
-
-  // We need VLX for 128/256-bit.
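For reference, the combine being reverted folds "A op1 (B op2 C)" into a single VPTERNLOG whose 8-bit immediate is the truth table of the expression: for the operand order used here, bit (A<<2 | B<<1 | C) of the immediate holds the expression's value for that assignment of A, B and C, which is where the hard-coded immediates in the switch below (0x80, 0xe0, 0x60, ..., 0x96) come from. The standalone C++ sketch that follows is illustrative only (it is not part of this patch or of LLVM, and the names LogicOp, apply and ternlogImm are invented for the example); it simply derives the same constants by brute force.

// Standalone illustration, not part of the patch: derive the VPTERNLOG
// truth-table immediate for "A Op1 (B Op2 C)" by evaluating the expression
// on all eight combinations of the inputs.
#include <cstdint>
#include <cstdio>

enum class LogicOp { And, Or, Xor };

static unsigned apply(LogicOp Op, unsigned X, unsigned Y) {
  switch (Op) {
  case LogicOp::And: return X & Y;
  case LogicOp::Or:  return X | Y;
  case LogicOp::Xor: return X ^ Y;
  }
  return 0;
}

// Bit (A<<2 | B<<1 | C) of the immediate is the value of A Op1 (B Op2 C)
// for that assignment of A, B and C.
static uint8_t ternlogImm(LogicOp Op1, LogicOp Op2) {
  uint8_t Imm = 0;
  for (unsigned Idx = 0; Idx < 8; ++Idx) {
    unsigned A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
    if (apply(Op1, A, apply(Op2, B, C)))
      Imm |= 1u << Idx;
  }
  return Imm;
}

int main() {
  // Matches the reverted table: AND/AND -> 0x80, AND/OR -> 0xe0,
  // OR/AND -> 0xf8, XOR/XOR -> 0x96, and so on.
  printf("and(and): 0x%02x\n", unsigned(ternlogImm(LogicOp::And, LogicOp::And)));
  printf("and(or):  0x%02x\n", unsigned(ternlogImm(LogicOp::And, LogicOp::Or)));
  printf("xor(xor): 0x%02x\n", unsigned(ternlogImm(LogicOp::Xor, LogicOp::Xor)));
  return 0;
}

The reverted code spells out the same mapping as a nested switch over the two opcodes, which is equivalent for the nine AND/OR/XOR pairings and avoids any computation at isel time.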
- if (!(Subtarget->hasVLX() || NVT.is512BitVector())) - return false; - - unsigned Opc1 = N->getOpcode(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - auto isLogicOp = [](unsigned Opc) { - return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; - }; - - SDValue A, B, C; - unsigned Opc2; - if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) { - Opc2 = N1.getOpcode(); - A = N0; - B = N1.getOperand(0); - C = N1.getOperand(1); - } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) { - Opc2 = N0.getOpcode(); - A = N1; - B = N0.getOperand(0); - C = N0.getOperand(1); - } else - return false; - - uint64_t Imm; - switch (Opc1) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: - switch (Opc2) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = 0x80; break; - case ISD::OR: Imm = 0xe0; break; - case ISD::XOR: Imm = 0x60; break; - } - break; - case ISD::OR: - switch (Opc2) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = 0xf8; break; - case ISD::OR: Imm = 0xfe; break; - case ISD::XOR: Imm = 0xf6; break; - } - break; - case ISD::XOR: - switch (Opc2) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = 0x78; break; - case ISD::OR: Imm = 0x1e; break; - case ISD::XOR: Imm = 0x96; break; - } - break; - } - - SDLoc DL(N); - SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C, - CurDAG->getTargetConstant(Imm, DL, MVT::i8)); - ReplaceNode(N, New.getNode()); - SelectCode(New.getNode()); - return true; -} - /// Convert vector increment or decrement to sub/add with an all-ones constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> @@ -4580,10 +4503,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case ISD::XOR: if (tryShrinkShlLogicImm(Node)) return; + if (Opcode == ISD::OR && tryMatchBitSelect(Node)) return; - if (tryVPTERNLOG(Node)) - return; LLVM_FALLTHROUGH; case ISD::ADD: diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 10f3d5386bc..6f724738864 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -331,8 +331,8 @@ define <4 x float> @ulto4f32(<4 x i64> %a) { define <8 x double> @ulto8f64(<8 x i64> %a) { ; NODQ-LABEL: ulto8f64: ; NODQ: # %bb.0: -; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; NODQ-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 ; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 ; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 @@ -356,20 +356,21 @@ define <16 x double> @ulto16f64(<16 x i64> %a) { ; NODQ-LABEL: ulto16f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; NODQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm0, %zmm4 +; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = 
[4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3 ; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] ; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 ; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 -; NODQ-NEXT: vaddpd %zmm0, %zmm4, %zmm0 -; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm3 +; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2 +; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2 ; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 ; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 ; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 -; NODQ-NEXT: vaddpd %zmm1, %zmm3, %zmm1 +; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: ulto16f64: diff --git a/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/test/CodeGen/X86/avx512-gfni-intrinsics.ll index 7e5e3e8d278..fbb28766f65 100644 --- a/test/CodeGen/X86/avx512-gfni-intrinsics.ll +++ b/test/CodeGen/X86/avx512-gfni-intrinsics.ll @@ -7,21 +7,21 @@ define <16 x i8> @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, < ; X86-LABEL: test_vgf2p8affineinvqb_128: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x03] -; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x03] ; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x03] -; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] -; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] +; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineinvqb_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x03] -; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x03] ; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x03] -; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] -; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xf9,0x6f,0xc3] +; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] +; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) @@ -37,21 +37,21 @@ define <32 x i8> @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, < ; X86-LABEL: test_vgf2p8affineinvqb_256: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x03] -; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x03] ; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x03] -; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] -; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] +; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineinvqb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x03] -; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x03] ; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x03] -; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] -; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] +; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) @@ -67,21 +67,21 @@ define <64 x i8> @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, < ; X86-LABEL: test_vgf2p8affineinvqb_512: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x03] -; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: 
[0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x03] ; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x03] -; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] -; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] +; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineinvqb_512: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x03] -; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x03] ; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x03] -; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] +; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) @@ -97,21 +97,21 @@ define <16 x i8> @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 ; X86-LABEL: test_vgf2p8affineqb_128: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x03] -; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xd9,0x03] +; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] +; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x03] ; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x03] -; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] -; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] +; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineqb_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x03] -; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xd9,0x03] +; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] +; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x03] ; X64-NEXT: vgf2p8affineqb $3, 
%xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x03] -; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] -; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] +; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) @@ -127,21 +127,21 @@ define <32 x i8> @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 ; X86-LABEL: test_vgf2p8affineqb_256: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x03] -; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x03] +; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] +; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x03] ; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x03] -; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] -; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] +; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineqb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x03] -; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x03] +; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] +; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x03] ; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x03] -; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] -; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] +; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) @@ -157,21 +157,21 @@ define <64 x i8> @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 ; X86-LABEL: test_vgf2p8affineqb_512: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x03] -; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x03] +; 
X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] +; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x03] ; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x03] -; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] -; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] +; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineqb_512: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x03] -; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x03] +; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] +; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x03] ; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x03] -; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] +; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) @@ -187,21 +187,21 @@ define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8 ; X86-LABEL: test_vgf2p8mulb_128: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xe1] -; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xd9] +; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xd9] +; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xe1] ; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xcf,0xd1] -; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] -; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] +; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8mulb_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xe1] -; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xd9] +; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xd9] +; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xe1] ; 
X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xcf,0xd1] -; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] -; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] +; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) @@ -217,21 +217,21 @@ define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8 ; X86-LABEL: test_vgf2p8mulb_256: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xe1] -; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xd9] +; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xd9] +; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xe1] ; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] -; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] -; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] +; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8mulb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xe1] -; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xd9] +; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xd9] +; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xe1] ; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] -; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] -; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] +; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) @@ -247,21 +247,21 @@ define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8 ; X86-LABEL: test_vgf2p8mulb_512: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xe1] -; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xd9] +; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xd9] +; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: 
[0x62,0xf2,0x7d,0xc9,0xcf,0xe1] ; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] -; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] -; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] +; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8mulb_512: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xe1] -; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xd9] +; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xd9] +; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xe1] ; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] -; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] +; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) diff --git a/test/CodeGen/X86/machine-combiner-int-vec.ll b/test/CodeGen/X86/machine-combiner-int-vec.ll index 4e07b4abde4..e4420be56b4 100644 --- a/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -13,18 +13,12 @@ define <4 x i32> @reassociate_and_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: reassociate_and_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: reassociate_and_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogd $128, %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: reassociate_and_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 %t1 = and <4 x i32> %x2, %t0 @@ -40,18 +34,12 @@ define <4 x i32> @reassociate_or_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> % ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: reassociate_or_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: reassociate_or_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogd $254, %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: reassociate_or_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 %t1 = or <4 x i32> %x2, %t0 @@ -67,18 +55,12 @@ define <4 x i32> @reassociate_xor_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: 
reassociate_xor_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: reassociate_xor_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogd $150, %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: reassociate_xor_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 %t1 = xor <4 x i32> %x2, %t0 @@ -99,18 +81,12 @@ define <8 x i32> @reassociate_and_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: retq ; -; AVX2-LABEL: reassociate_and_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: reassociate_and_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogd $128, %ymm2, %ymm3, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: reassociate_and_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpand %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 %t1 = and <8 x i32> %x2, %t0 @@ -129,18 +105,12 @@ define <8 x i32> @reassociate_or_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> % ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: retq ; -; AVX2-LABEL: reassociate_or_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: reassociate_or_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogd $254, %ymm2, %ymm3, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: reassociate_or_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpor %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 %t1 = or <8 x i32> %x2, %t0 @@ -159,18 +129,12 @@ define <8 x i32> @reassociate_xor_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: retq ; -; AVX2-LABEL: reassociate_xor_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: reassociate_xor_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogd $150, %ymm2, %ymm3, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: reassociate_xor_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpxor %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 %t1 = xor <8 x i32> %x2, %t0 @@ -211,7 +175,8 @@ define <16 x i32> @reassociate_and_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; AVX512-LABEL: reassociate_and_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $128, %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -250,7 +215,8 @@ define <16 x i32> @reassociate_or_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i ; AVX512-LABEL: reassociate_or_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $254, %zmm2, 
%zmm3, %zmm0 +; AVX512-NEXT: vpord %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -289,7 +255,8 @@ define <16 x i32> @reassociate_xor_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; AVX512-LABEL: reassociate_xor_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $150, %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vpxord %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 diff --git a/test/CodeGen/X86/midpoint-int-vec-128.ll b/test/CodeGen/X86/midpoint-int-vec-128.ll index f78ab3be2db..7f0e19e58a0 100644 --- a/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2263,12 +2263,12 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %xmm3, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -3123,14 +3123,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %xmm3, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 diff --git a/test/CodeGen/X86/midpoint-int-vec-256.ll 
b/test/CodeGen/X86/midpoint-int-vec-256.ll index 15723c66777..0d28d6145ce 100644 --- a/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1855,12 +1855,12 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm3, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2789,21 +2789,21 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm3, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: 
vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq diff --git a/test/CodeGen/X86/midpoint-int-vec-512.ll b/test/CodeGen/X86/midpoint-int-vec-512.ll index 5414cdd73dd..c3743ca82a1 100644 --- a/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -418,20 +418,21 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm8, %ymm0, %ymm9 -; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm9 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm8, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm9, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -876,44 +877,45 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm4, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vmovdqa 
{{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm9 -; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm9 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm4, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm8, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm9[8],ymm0[8],ymm9[9],ymm0[9],ymm9[10],ymm0[10],ymm9[11],ymm0[11],ymm9[12],ymm0[12],ymm9[13],ymm0[13],ymm9[14],ymm0[14],ymm9[15],ymm0[15],ymm9[24],ymm0[24],ymm9[25],ymm0[25],ymm9[26],ymm0[26],ymm9[27],ymm0[27],ymm9[28],ymm0[28],ymm9[29],ymm0[29],ymm9[30],ymm0[30],ymm9[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[4],ymm0[4],ymm9[5],ymm0[5],ymm9[6],ymm0[6],ymm9[7],ymm0[7],ymm9[16],ymm0[16],ymm9[17],ymm0[17],ymm9[18],ymm0[18],ymm9[19],ymm0[19],ymm9[20],ymm0[20],ymm9[21],ymm0[21],ymm9[22],ymm0[22],ymm9[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/sadd_sat_vec.ll b/test/CodeGen/X86/sadd_sat_vec.ll index e3056bdfff8..94c6e46ea96 100644 --- a/test/CodeGen/X86/sadd_sat_vec.ll +++ b/test/CodeGen/X86/sadd_sat_vec.ll @@ -495,49 +495,20 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: v16i4: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: v16i4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: v16i4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; 
AVX512-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v16i4: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } diff --git a/test/CodeGen/X86/ssub_sat_vec.ll b/test/CodeGen/X86/ssub_sat_vec.ll index 3a0031c4214..43e8fca8304 100644 --- a/test/CodeGen/X86/ssub_sat_vec.ll +++ b/test/CodeGen/X86/ssub_sat_vec.ll @@ -495,49 +495,20 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: v16i4: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: v16i4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: v16i4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v16i4: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %z = call <16 x i4> 
@llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 1570fdc62a7..6b863456dfa 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -552,8 +552,8 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { ; ; AVX512VL-LABEL: uitofp_2i64_to_2f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4841369599423283200,4841369599423283200] -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 @@ -905,8 +905,8 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512VL-LABEL: uitofp_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 @@ -3464,8 +3464,8 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4841369599423283200,4841369599423283200] -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 @@ -3847,8 +3847,8 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-fshl-256.ll b/test/CodeGen/X86/vector-fshl-256.ll index ff77e4efb6a..cf8a80cf9db 100644 --- a/test/CodeGen/X86/vector-fshl-256.ll +++ b/test/CodeGen/X86/vector-fshl-256.ll @@ -1550,10 +1550,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 ; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm3, %ymm4 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: 
splatvar_funnnel_v32i8: diff --git a/test/CodeGen/X86/vector-fshl-512.ll b/test/CodeGen/X86/vector-fshl-512.ll index 5c3f9da3af1..b6c5d9f744e 100644 --- a/test/CodeGen/X86/vector-fshl-512.ll +++ b/test/CodeGen/X86/vector-fshl-512.ll @@ -867,14 +867,16 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm8, %xmm6 ; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512VL-NEXT: vpternlogq $236, %ymm6, %ymm10, %ymm9 -; AVX512VL-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX512VL-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm9, %ymm4 +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8 +; AVX512VL-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm5 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpternlogq $236, %ymm6, %ymm5, %ymm1 +; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -892,11 +894,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 -; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512BW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 +; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -915,11 +918,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 -; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512VBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -938,11 +942,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: 
vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 -; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512VLBW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -961,11 +966,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 diff --git a/test/CodeGen/X86/vector-fshl-rot-256.ll b/test/CodeGen/X86/vector-fshl-rot-256.ll index 41c538dd7d7..ca624b0a82e 100644 --- a/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -446,11 +446,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i8: @@ -836,11 +837,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 +; AVX512VL-NEXT: vpsrlw 
$8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/test/CodeGen/X86/vector-fshl-rot-512.ll b/test/CodeGen/X86/vector-fshl-rot-512.ll index 6b7fc3d0339..8cb0f36a176 100644 --- a/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -186,11 +186,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm7 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpternlogq $248, %ymm9, %ymm5, %ymm7 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9 +; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 @@ -203,10 +204,11 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $248, %ymm9, %ymm3, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -433,11 +435,13 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm4, %ymm2 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm3, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -456,11 +460,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: @@ -478,11 +483,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) diff --git a/test/CodeGen/X86/vector-fshr-256.ll b/test/CodeGen/X86/vector-fshr-256.ll index b62675b2bec..8898373bfe8 100644 --- a/test/CodeGen/X86/vector-fshr-256.ll +++ b/test/CodeGen/X86/vector-fshr-256.ll @@ -1538,23 +1538,24 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpsllw %xmm3, %xmm4, %xmm3 -; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm5 -; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 +; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 ; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm0, %ymm3 -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/test/CodeGen/X86/vector-fshr-512.ll b/test/CodeGen/X86/vector-fshr-512.ll index 21920e3dc0f..ca559a6911a 
100644 --- a/test/CodeGen/X86/vector-fshr-512.ll +++ b/test/CodeGen/X86/vector-fshr-512.ll @@ -838,32 +838,34 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm10 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm5, %xmm5 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7 +; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7 +; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7 ; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm6, %ymm10, %ymm9 -; AVX512VL-NEXT: vpsrlw %xmm6, %xmm8, %xmm3 -; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm4, %ymm9 -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm10, %ymm9, %ymm4 -; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm6, %ymm1, %ymm5 -; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm0, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm0 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm9, %ymm9 +; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6 +; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8 +; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -871,20 +873,21 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw 
%xmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512BW-NEXT: vpsllw %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512BW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq @@ -893,20 +896,21 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 -; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 
%zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: retq @@ -915,20 +919,21 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm4, %xmm3 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm3 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq @@ -937,20 +942,21 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq 
$236, %zmm5, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLVBMI2-NEXT: retq diff --git a/test/CodeGen/X86/vector-fshr-rot-256.ll b/test/CodeGen/X86/vector-fshr-rot-256.ll index e617cc05a01..bf7c057965b 100644 --- a/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -487,11 +487,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i8: @@ -911,11 +912,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/test/CodeGen/X86/vector-fshr-rot-512.ll b/test/CodeGen/X86/vector-fshr-rot-512.ll index 9de5bdbbf01..3838dfd4dd1 100644 --- a/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -197,11 +197,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10 +; AVX512VL-NEXT: vpor 
%ymm5, %ymm10, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 @@ -215,10 +216,11 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $248, %ymm10, %ymm3, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -453,56 +455,60 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm4, %ymm2 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm3, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4 ; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512BW-NEXT: vpsllw %xmm2, %xmm5, %xmm2 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm5, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq %zmm2, %zmm4, %zmm2 -; AVX512VLBW-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm1 +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 550cf3884c9..336311e1b79 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -186,8 +186,9 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm2, %zmm1 +; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -540,8 +541,9 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm3, %zmm2 +; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index f2eb1aef9e3..3acdca7cda5 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -2003,35 +2003,13 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: splatconstant_rotate_mask_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $11, %xmm0, %xmm1 -; AVX512F-NEXT: vpsllw $5, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $11, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsllw $5, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0 -; AVX512VLBW-NEXT: retq +; AVX512-LABEL: splatconstant_rotate_mask_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v8i16: ; XOP: # %bb.0: @@ -2077,39 +2055,14 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: splatconstant_rotate_mask_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 -; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_rotate_mask_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0 -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i8: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0 -; AVX512VLBW-NEXT: 
vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0 -; AVX512VLBW-NEXT: retq +; AVX512-LABEL: splatconstant_rotate_mask_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v16i8: ; XOP: # %bb.0: diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll index 158dc3b6ce7..df76a7738f8 100644 --- a/test/CodeGen/X86/vector-rotate-256.ll +++ b/test/CodeGen/X86/vector-rotate-256.ll @@ -442,11 +442,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_rotate_v32i8: @@ -826,11 +827,12 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_rotate_v32i8: @@ -1711,35 +1713,13 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: splatconstant_rotate_mask_v16i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1 -; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1 -; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1 -; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), 
%ymm1, %ymm0 -; AVX512VLBW-NEXT: retq +; AVX512-LABEL: splatconstant_rotate_mask_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1 +; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16: ; XOPAVX1: # %bb.0: @@ -1808,11 +1788,9 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpandn %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8: @@ -1827,9 +1805,10 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1 -; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8: diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll index 0b858e7dc9e..d92d73a220d 100644 --- a/test/CodeGen/X86/vector-rotate-512.ll +++ b/test/CodeGen/X86/vector-rotate-512.ll @@ -179,11 +179,12 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 +; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4 @@ -195,10 +196,11 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm3, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; 
AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -423,11 +425,13 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm4, %ymm2 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm3, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -442,11 +446,12 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_rotate_v64i8: @@ -460,11 +465,12 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat @@ -883,27 +889,31 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpternlogq $200, %ymm3, %ymm2, %ymm1 +; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $200, %ymm3, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
-; AVX512BW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
-; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
   %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
@@ -938,34 +948,34 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512VL-NEXT: vpternlogq $200, %ymm2, %ymm4, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $200, %ymm2, %ymm4, %ymm0
+; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
-; AVX512BW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
-; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
   %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 38c3488187c..8b309827752 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1443,8 +1443,9 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v16i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 0d92b72b275..2f059a80765 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -946,14 +946,15 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512DQVL-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512DQVL-NEXT: vpternlogq $108, %ymm1, %ymm2, %ymm0
-; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
+; AVX512DQVL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
@@ -1631,8 +1632,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v32i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index b4e8e859f2b..1bb62977ca9 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -212,14 +212,15 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vpternlogq $108, %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = ashr <64 x i8> %a, %splat
@@ -374,8 +375,9 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-LABEL: splatconstant_shift_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
   %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index d410d497ea5..3f0345ad8b2 100644
--- a/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -2354,8 +2354,9 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
@@ -2407,8 +2408,9 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v4i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
@@ -2460,8 +2462,9 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v2i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
-- 
2.40.0
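Appended for illustration only (not part of the patch): a minimal LLVM IR sketch of the kind of pattern whose codegen the updated checks above track. The function name, argument types, and llc invocation below are assumptions chosen for the example; the shape simply mirrors the tests, where an 'and' feeding a single-use 'or' on a 512-bit vector previously selected to one vpternlogq and now selects to separate vpandq and vporq instructions.

; Hypothetical reduced example, assuming an AVX-512 target.
; Compile with e.g.: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f example.ll
; The 'and' result has a single use (the 'or' that consumes it), matching the
; two-logic-op chains exercised by the tests touched in this patch.
define <8 x i64> @or_and_sketch(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
  %and = and <8 x i64> %b, %c
  %or = or <8 x i64> %a, %and
  ret <8 x i64> %or
}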