setOperationAction(ISD::FSHR, VT, Custom);
}
}
+
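+ // These source types are illegal; marking the truncates Custom makes the
+ // type legalizer call LowerTRUNCATE for them, where we can emit a cheaper
+ // sequence than the default legalization (see the tests below).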
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
}
// We want to custom lower some of our intrinsics.
"Invalid TRUNCATE operation");
- // If called by the legalizer just return.
+ // If we're called by the type legalizer, handle a few cases.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
+ if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) {
+ assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+ // The default behavior is to truncate one step, concatenate, and then
+ // truncate the remainder. We'd rather produce two 64-bit results and
+ // concatenate those.
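+ // With VLX, each half of the source (v8i32 or v4i64) is a legal type.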
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+
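+ // Each destination half is one 64-bit result (v8i8 or v4i16).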
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
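+ // E.g. v8i64 -> v8i16 becomes two v4i64 -> v4i16 truncates (VPMOVQW)
+ // whose 64-bit results are combined with a single unpack (VPUNPCKLQDQ).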
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Otherwise let default legalization handle it.
return SDValue();
+ }
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
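
A minimal sketch of IR that exercises the new path (hypothetical function
name; assumes a target where <8 x i64> is not a legal type, e.g. AVX512VL
with 256-bit-preferring tuning), compiling to the VPMOVQW/VPUNPCKLQDQ
sequence checked below:

define void @trunc_v8i64_v8i16(<8 x i64>* %x, <8 x i16>* %y) "min-legal-vector-width"="256" {
  %a = load <8 x i64>, <8 x i64>* %x
  %b = trunc <8 x i64> %a to <8 x i16>
  store <8 x i16> %b, <8 x i16>* %y
  ret void
}
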
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpmovdw %ymm0, %xmm0
-; CHECK-NEXT: vpmovdw %ymm1, %xmm1
-; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK-NEXT: vpmovdb %ymm1, %xmm1
+; CHECK-NEXT: vpmovdb %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
-; CHECK-NEXT: vpmovqd %ymm1, %xmm1
-; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpmovdw %ymm0, %xmm0
+; CHECK-NEXT: vpmovqw %ymm1, %xmm1
+; CHECK-NEXT: vpmovqw %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x