const X86Subtarget &Subtarget) {
SDLoc DL(N);
- // Pre-shrink oversized index elements to avoid triggering scalarization.
- if (DCI.isBeforeLegalize()) {
+ if (DCI.isBeforeLegalizeOps()) {
SDValue Index = N->getOperand(4);
- if (Index.getScalarValueSizeInBits() > 64) {
- EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
- Index.getValueType().getVectorNumElements());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Trunc;
- DAG.UpdateNodeOperands(N, NewOps);
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ // Remove any sign extends from 32 or smaller to larger than 32.
+ // Only do this before LegalizeOps in case we need the sign extend for
+ // legalization.
+ if (Index.getOpcode() == ISD::SIGN_EXTEND) {
+ if (Index.getScalarValueSizeInBits() > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original sign extend has less users, add back to worklist in case
+ // it needs to be removed
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
}
- }
- // Try to remove sign extends from i32 to i64 on the index.
- // Only do this before legalize in case we are relying on it for
- // legalization.
- // TODO: We should maybe remove any sign extend once we learn how to sign
- // extend narrow index during lowering.
- if (DCI.isBeforeLegalizeOps()) {
- SDValue Index = N->getOperand(4);
- if (Index.getScalarValueSizeInBits() == 64 &&
- Index.getOpcode() == ISD::SIGN_EXTEND &&
- Index.getOperand(0).getScalarValueSizeInBits() == 32) {
+ // Make sure the index is either i32 or i64
+ unsigned ScalarSize = Index.getScalarValueSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64) {
+ MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+ EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ Index.getValueType().getVectorNumElements());
+ Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
+ NewOps[4] = Index;
DAG.UpdateNodeOperands(N, NewOps);
- // The original sign extend has less users, add back to worklist in case
- // it needs to be removed.
- DCI.AddToWorklist(Index.getNode());
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64-LABEL: sext_i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm1
-; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: kxnorw %k0, %k0, %k2
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm1
-; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: kxnorw %k0, %k0, %k2
-; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbw %xmm0, %ymm0
-; SKX-NEXT: vpmovsxwq %xmm0, %zmm1
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpmovsxbw %xmm0, %ymm0
-; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm1
-; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: kxnorw %k0, %k0, %k2
-; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <16 x i8> %ind to <16 x i64>
define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: sext_v8i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_64-NEXT: vpsllq $56, %zmm0, %zmm0
-; KNL_64-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0
+; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm0
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_v8i8_index:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpsllq $56, %zmm0, %zmm0
-; KNL_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0
+; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm0
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_v8i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX-NEXT: vpsllq $56, %zmm0, %zmm0
-; SKX-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT: vpslld $24, %ymm0, %ymm0
+; SKX-NEXT: vpsrad $24, %ymm0, %ymm1
+; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_v8i8_index:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpsllq $56, %zmm0, %zmm0
-; SKX_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
+; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <8 x i8> %ind to <8 x i64>