setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
// We want to custom lower some of our intrinsics.
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
- // If called by the legalizer just return.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
- if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) {
+ // If we're called by the type legalizer, handle a few cases.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(InVT)) {
+ if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
+ VT.is128BitVector()) {
assert(Subtarget.hasVLX() && "Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
return;
}
}
+ if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
+ isTypeLegal(MVT::v4i64)) {
+ // Input needs to be split and output needs to be widened. Let's use two
+ // VTRUNCs, and shuffle their results together into the wider type.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
+
+ Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
+ Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
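+ // Each v4i64 VTRUNC leaves its 4 meaningful bytes in the low lanes of a
+ // v16i8, so take elements 0-3 from Lo and elements 0-3 from Hi (indices
+ // 16-19 in the concatenated shuffle numbering).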
+ SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ -1, -1, -1, -1, -1, -1, -1, -1 });
+ Results.push_back(Res);
+ return;
+ }
+
return;
}
case ISD::ANY_EXTEND:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
-; CHECK-NEXT: vpmovqd %ymm2, %xmm2
-; CHECK-NEXT: vpmovqd %ymm3, %xmm3
-; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; CHECK-NEXT: vpmovdb %ymm2, %xmm2
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
-; CHECK-NEXT: vpmovqd %ymm1, %xmm1
-; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpmovdb %ymm0, %xmm0
+; CHECK-NEXT: vpmovqb %ymm3, %xmm3
+; CHECK-NEXT: vpmovqb %ymm2, %xmm2
+; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: vpmovqb %ymm1, %xmm1
+; CHECK-NEXT: vpmovqb %ymm0, %xmm0
+; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
}
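; For reference, a minimal IR sketch (hypothetical function name) of the kind
; of v16i64 -> v16i8 truncate the checks above correspond to, assuming the
; same 256-bit "min-legal-vector-width" cap as the function below:
;
;   define <16 x i8> @trunc_v16i64_v16i8_sketch(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
;     %a = load <16 x i64>, <16 x i64>* %x
;     %b = trunc <16 x i64> %a to <16 x i8>
;     ret <16 x i8> %b
;   }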
define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-AVX512-LABEL: trunc_v8i64_v8i8:
-; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-AVX512-NEXT: vpmovqb %ymm1, %xmm1
-; CHECK-AVX512-NEXT: vpmovqb %ymm0, %xmm0
-; CHECK-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-AVX512-NEXT: vzeroupper
-; CHECK-AVX512-NEXT: retq
-;
-; CHECK-VBMI-LABEL: trunc_v8i64_v8i8:
-; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
-; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-VBMI-NEXT: vzeroupper
-; CHECK-VBMI-NEXT: retq
+; CHECK-LABEL: trunc_v8i64_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT: vpmovqb %ymm1, %xmm1
+; CHECK-NEXT: vpmovqb %ymm0, %xmm0
+; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x
%b = trunc <8 x i64> %a to <8 x i8>
ret <8 x i8> %b
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc_packus_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
+; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
+; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = icmp sgt <16 x i64> %2, zeroinitializer
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
-; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsqb %zmm1, %xmm1
+; AVX512VL-NEXT: vpmovsqb %zmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsqb %zmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%1 = icmp slt <16 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
%3 = icmp sgt <16 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc_usat_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_usat_v16i64_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_usat_v16i64_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
+; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_usat_v16i64_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v16i64_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%1 = icmp ult <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = trunc <16 x i64> %2 to <16 x i8>