From 4d65e3204f44c877c777dd1d88ed7990a539bdf1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 1 May 2019 12:48:42 +0000 Subject: [PATCH] [X86][SSE] Split 512-bit -> 128-bit vector directly in SimplifyDemandedVectorEltsForTargetNode git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359678 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 5 +- test/CodeGen/X86/vector-reduce-mul-widen.ll | 212 ++++++++++---------- test/CodeGen/X86/vector-reduce-mul.ll | 212 ++++++++++---------- 3 files changed, 220 insertions(+), 209 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 73eb5c80aaf..657832ab74b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -33403,12 +33403,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not // demand any of the high elements, then narrow the op to 128/256-bits: e.g. // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 - // TODO: Handle 512-bit -> 128-bit ops as well. if ((VT.is256BitVector() || VT.is512BitVector()) && DemandedElts.lshr(NumElts / 2) == 0) { unsigned SizeInBits = VT.getSizeInBits(); unsigned ExtSizeInBits = SizeInBits / 2; + // See if 512-bit ops only use the bottom 128-bits. + if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) + ExtSizeInBits = SizeInBits / 4; + switch (Opc) { // Target Shuffles. case X86ISD::PSHUFB: diff --git a/test/CodeGen/X86/vector-reduce-mul-widen.ll b/test/CodeGen/X86/vector-reduce-mul-widen.ll index 2181dfc45be..7a7d96d2d58 100644 --- a/test/CodeGen/X86/vector-reduce-mul-widen.ll +++ b/test/CodeGen/X86/vector-reduce-mul-widen.ll @@ -2513,36 +2513,37 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -2563,36 +2564,37 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -3093,36 +3095,37 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -3152,36 +3155,37 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll index 4da213503f7..a533959b8f2 100644 --- a/test/CodeGen/X86/vector-reduce-mul.ll +++ b/test/CodeGen/X86/vector-reduce-mul.ll @@ -2465,36 +2465,37 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -2515,36 +2516,37 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -3045,36 +3047,37 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -3104,36 +3107,37 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax -- 2.40.0