[(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
_.ExeDomain>, EVEX;
- let Constraints = "$src0 = $dst" in {
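+ // Allow TwoAddressInstructionPass to call convertToThreeAddress and turn the
+ // masked moves below into BLENDM instructions, which carry no tied operand.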
+ let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
                  .add(Dest).add(Src),
                  MI.getOperand(2));
break;
+
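+ // Masked loads (*rmk) tie the passthru operand to the destination register.
+ // Convert them to the equivalent BLENDM load, which takes the passthru as a
+ // plain source operand, so the tie (and usually a copy) goes away.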
+ case X86::VMOVDQU8Z128rmk:
+ case X86::VMOVDQU8Z256rmk:
+ case X86::VMOVDQU8Zrmk:
+ case X86::VMOVDQU16Z128rmk:
+ case X86::VMOVDQU16Z256rmk:
+ case X86::VMOVDQU16Zrmk:
+ case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
+ case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
+ case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
+ case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
+ case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
+ case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
+ case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
+ case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
+ case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
+ case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
+ case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
+ case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ }
+
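+ // BLENDM load operand order: destination, mask, passthru (the former tied
+ // source), then the five memory-reference operands.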
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5))
+ .add(MI.getOperand(6))
+ .add(MI.getOperand(7));
+ break;
+ }
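+ // Likewise for the masked register-to-register moves (*rrk): the register
+ // form of BLENDM takes the passthru as its first source.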
+ case X86::VMOVDQU8Z128rrk:
+ case X86::VMOVDQU8Z256rrk:
+ case X86::VMOVDQU8Zrrk:
+ case X86::VMOVDQU16Z128rrk:
+ case X86::VMOVDQU16Z256rrk:
+ case X86::VMOVDQU16Zrrk:
+ case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
+ case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
+ case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
+ case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
+ case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
+ case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
+ case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
+ case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
+ case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
+ case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
+ case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
+ case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
+ case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
+ case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
+ case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
+ case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
+ case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
+ case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ }
+
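+ // BLENDM register-form operand order: destination, mask, passthru, and the
+ // second source register.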
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3));
+ break;
+ }
}
if (!NewMI) return nullptr;
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
-; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
-; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovapd %zmm1, %zmm0
-; AVX512BW-NEXT: vmovapd %zmm2, %zmm1
-; AVX512BW-NEXT: vmovapd %zmm3, %zmm2
-; AVX512BW-NEXT: vmovapd %zmm4, %zmm3
+; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovapd %zmm5, %zmm2
; AVX512BW-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
ret <32 x double> %res
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm5 {%k2}
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
-; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
-; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
; AVX512BW-NEXT: retq
%res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
ret <32 x i64> %res
; AVX512: ## BB#0:
; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
%res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
+; SKX-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT: retq
%res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovapd %zmm1, %zmm0
-; AVX512F-NEXT: vmovapd %zmm2, %zmm1
+; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
-; SKX-NEXT: vmovapd %zmm2, %zmm1
+; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT: retq
%res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512F-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k2, %k1
-; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovapd %zmm1, %zmm0
-; AVX512F-NEXT: vmovapd %zmm2, %zmm1
-; AVX512F-NEXT: vmovapd %zmm3, %zmm2
-; AVX512F-NEXT: vmovapd %zmm4, %zmm3
+; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovapd %zmm5, %zmm2
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_32f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT: kshiftrd $16, %k1, %k2
-; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k2, %k1
-; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
-; SKX-NEXT: vmovapd %zmm2, %zmm1
-; SKX-NEXT: vmovapd %zmm3, %zmm2
-; SKX-NEXT: vmovapd %zmm4, %zmm3
+; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
+; SKX-NEXT: vmovapd %zmm5, %zmm2
; SKX-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
ret <32 x double> %res
; CHECK: ## BB#0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> %val)
ret <32 x i8> %res
; CHECK: ## BB#0:
; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
ret <64 x i8> %res
; CHECK: ## BB#0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
ret <32 x i16> %res
}
; X32-LABEL: test_argRet128Vector:
-; X32: vmovdqa{{.*}} %xmm0, %xmm1
-; X32: vmovdqa{{.*}} %xmm1, %xmm0
+; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0
; X32: ret{{.*}}
; WIN64-LABEL: test_argRet128Vector:
-; WIN64: vmovdqa{{.*}} %xmm0, %xmm1
-; WIN64: vmovdqa{{.*}} %xmm1, %xmm0
+; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0
; WIN64: ret{{.*}}
; Test regcall when receiving/returning 128 bit vector
}
; X32-LABEL: test_argRet256Vector:
-; X32: vmovdqa{{.*}} %ymm0, %ymm1
-; X32: vmovdqa{{.*}} %ymm1, %ymm0
+; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0
; X32: ret{{.*}}
; WIN64-LABEL: test_argRet256Vector:
-; WIN64: vmovdqa{{.*}} %ymm0, %ymm1
-; WIN64: vmovdqa{{.*}} %ymm1, %ymm0
+; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0
; WIN64: ret{{.*}}
; Test regcall when receiving/returning 256 bit vector
}
; X32-LABEL: test_argRet512Vector:
-; X32: vmovdqa{{.*}} %zmm0, %zmm1
-; X32: vmovdqa{{.*}} %zmm1, %zmm0
+; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0
; X32: ret{{.*}}
; WIN64-LABEL: test_argRet512Vector:
-; WIN64: vmovdqa{{.*}} %zmm0, %zmm1
-; WIN64: vmovdqa{{.*}} %zmm1, %zmm0
+; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0
; WIN64: ret{{.*}}
; Test regcall when receiving/returning 512 bit vector
; CHECK-LABEL: test1:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
; CHECK-LABEL: test3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
; CHECK-LABEL: test4_unsigned:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
; CHECK-LABEL: test5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
; CHECK-LABEL: test6_unsigned:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
; SKX: ## BB#0:
; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = fcmp olt <4 x float> %a, zeroinitializer
; SKX: ## BB#0:
; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = fcmp olt <2 x double> %a, zeroinitializer
%c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovdqa %ymm1, %ymm0
+; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovdqa %ymm1, %ymm0
+; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %ymm1, %ymm0
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
-; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = fcmp oeq <8 x float> %x, %y
; CHECK-LABEL: test16:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
-; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
; CHECK-LABEL: test17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
; CHECK-LABEL: test18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
; CHECK-LABEL: test19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
; CHECK-LABEL: test24:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
; CHECK-LABEL: test25:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
-; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = fcmp oeq <4 x double> %x, %y
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
-; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%y = load <2 x double>, <2 x double>* %yp, align 4
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
-; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%y = load <4 x double>, <4 x double>* %yp, align 4
; CHECK-LABEL: test33:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
; SKX-LABEL: test34:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
-; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vmovups (%rdi), %ymm2
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
-; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %ymm1, %ymm0
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: test35:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
-; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%y = load <8 x float>, <8 x float>* %yp, align 4
; CHECK-LABEL: test36:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
; CHECK-LABEL: test37:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%a = load double, double* %ptr
; SKX-LABEL: test38:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
-; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%a = load double, double* %ptr
; SKX-LABEL: test39:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
-; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%a = load double, double* %ptr
; CHECK-LABEL: test40:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%a = load float, float* %ptr
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vbroadcastss (%rdi), %ymm2
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
-; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %ymm1, %ymm0
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: test41:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
-; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%a = load float, float* %ptr
; SKX-LABEL: test42:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
-; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%a = load float, float* %ptr
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovapd %zmm1, %zmm0
+; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test43:
; SKX-NEXT: vpsllw $15, %xmm2, %xmm2
; SKX-NEXT: vpmovw2m %xmm2, %k1
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; SKX-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
%a = load double, double* %ptr
; CHECK-LABEL: test1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
; CHECK-LABEL: test3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1
-; CHECK-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
; CHECK-LABEL: test4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
; CHECK-LABEL: test5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %yp, align 4
%mask = icmp eq <32 x i16> %x, %y
; CHECK-LABEL: test6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sgt <32 x i16> %x, %y
; CHECK-LABEL: test7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sle <32 x i16> %x, %y
; CHECK-LABEL: test8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp ule <32 x i16> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <32 x i16> %x1, %y1
%mask0 = icmp eq <32 x i16> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <64 x i8> %x1, %y1
%mask0 = icmp sle <64 x i8> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <64 x i8> %x1, %y1
%y = load <64 x i8>, <64 x i8>* %y.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i16> %x1, %y1
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
; CHECK-LABEL: test256_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
; CHECK-LABEL: test256_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
; CHECK-LABEL: test256_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
; CHECK-LABEL: test256_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
; CHECK-LABEL: test256_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %yp, align 4
%mask = icmp eq <16 x i16> %x, %y
; CHECK-LABEL: test256_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sgt <16 x i16> %x, %y
; CHECK-LABEL: test256_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sle <16 x i16> %x, %y
; CHECK-LABEL: test256_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp ule <16 x i16> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i16> %x1, %y1
%mask0 = icmp eq <16 x i16> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i8> %x1, %y1
%mask0 = icmp sle <32 x i8> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <32 x i8> %x1, %y1
%y = load <32 x i8>, <32 x i8>* %y.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i16> %x1, %y1
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
; CHECK-LABEL: test128_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
; CHECK-LABEL: test128_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
; CHECK-LABEL: test128_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
; CHECK-LABEL: test128_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
; CHECK-LABEL: test128_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %yp, align 4
%mask = icmp eq <8 x i16> %x, %y
; CHECK-LABEL: test128_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sgt <8 x i16> %x, %y
; CHECK-LABEL: test128_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sle <8 x i16> %x, %y
; CHECK-LABEL: test128_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp ule <8 x i16> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i16> %x1, %y1
%mask0 = icmp eq <8 x i16> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i8> %x1, %y1
%mask0 = icmp sle <16 x i8> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <16 x i8> %x1, %y1
%y = load <16 x i8>, <16 x i8>* %y.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i16> %x1, %y1
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
; CHECK-LABEL: test256_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
; CHECK-LABEL: test256_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
; CHECK-LABEL: test256_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
; CHECK-LABEL: test256_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
; CHECK-LABEL: test256_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
; CHECK-LABEL: test256_5b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %y, %x
; CHECK-LABEL: test256_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
; CHECK-LABEL: test256_6b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp slt <8 x i32> %y, %x
; CHECK-LABEL: test256_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
; CHECK-LABEL: test256_7b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sge <8 x i32> %y, %x
; CHECK-LABEL: test256_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
; CHECK-LABEL: test256_8b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp uge <8 x i32> %y, %x
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
; CHECK-LABEL: test256_13:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
; CHECK-LABEL: test256_14:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
; CHECK-LABEL: test256_17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %x, %y
; CHECK-LABEL: test256_18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %y, %x
; CHECK-LABEL: test256_19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %x, %y
; CHECK-LABEL: test256_20:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %y, %x
; CHECK-LABEL: test128_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
; CHECK-LABEL: test128_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
; CHECK-LABEL: test128_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
; CHECK-LABEL: test128_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
; CHECK-LABEL: test128_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
; CHECK-LABEL: test128_5b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %y, %x
; CHECK-LABEL: test128_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
; CHECK-LABEL: test128_6b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp slt <4 x i32> %y, %x
; CHECK-LABEL: test128_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
; CHECK-LABEL: test128_7b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sge <4 x i32> %y, %x
; CHECK-LABEL: test128_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
; CHECK-LABEL: test128_8b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
; CHECK-LABEL: test128_13:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
; CHECK-LABEL: test128_14:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
; CHECK-LABEL: test128_17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %x, %y
; CHECK-LABEL: test128_18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %y, %x
; CHECK-LABEL: test128_19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %x, %y
; CHECK-LABEL: test128_20:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovdqa %xmm1, %xmm0
+; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovapd (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11a:
; SKX: ## BB#0:
; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; SKX-NEXT: vmovaps (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11b:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovdqa %ymm1, %ymm0
+; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
ret <8 x i32> %res
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
; SKX-NEXT: kshiftlw $14, %k0, %k0
; SKX-NEXT: kshiftrw $14, %k0, %k1
-; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i8> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
ret <16 x i16> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
ret <16 x i16> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i16> %shuffle
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
ret <32 x i8> %shuffle
; ALL: # BB#0:
; ALL-NEXT: movw $8, %ax
; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; ALL-NEXT: vmovdqa64 %zmm1, %zmm0
+; ALL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %c
; CHECK: # BB#0:
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>