ret i8 %4
}
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
ret i16 %4
}
-
-define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1808:
-; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1809:
-; NoVLX-NEXT: .cfi_offset %rbp, -16
-; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1810:
-; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: movq %rbp, %rsp
-; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
%2 = fcmp oeq <4 x float> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
-define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1811:
-; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1812:
-; NoVLX-NEXT: .cfi_offset %rbp, -16
-; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1813:
-; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1808:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1809:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1810:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1811:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1812:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1813:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
ret i32 %4
}
-
-define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1817:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
%2 = fcmp oeq <4 x float> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1820:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
%load = load <2 x i64>, <2 x i64>* %__b
%1 = bitcast <2 x i64> %load to <4 x float>
%2 = fcmp oeq <4 x float> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1823:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
-; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
%vec = insertelement <4 x float> undef, float %load, i32 0
%1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <4 x float> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $8, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %1 = bitcast <4 x i64> %__b to <8 x float>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
-}
-
-define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vmovaps (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $8, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <8 x float>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
-}
-
-define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $8, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %load = load float, float* %__b
- %vec = insertelement <8 x float> undef, float %load, i32 0
- %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
-}
-
-define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: vzeroupper
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1826:
; NoVLX-NEXT: .Lcfi1828:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %1 = bitcast <4 x i64> %__b to <8 x float>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: vzeroupper
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1829:
; NoVLX-NEXT: .Lcfi1831:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vmovaps (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <8 x float>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: vzeroupper
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1832:
; NoVLX-NEXT: .Lcfi1834:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
+ %0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
- %vec = insertelement <8 x float> undef, float %load, i32 0
- %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-
-define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
-; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1835:
; NoVLX-NEXT: .Lcfi1837:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %1 = bitcast <4 x i64> %__b to <8 x float>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
-; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1838:
; NoVLX-NEXT: .Lcfi1840:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vmovaps (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <8 x float>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <8 x float>
- %2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
-; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1841:
; NoVLX-NEXT: .Lcfi1843:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm3
+; NoVLX-NEXT: vcmpeqps %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load float, float* %__b
%vec = insertelement <8 x float> undef, float %load, i32 0
%1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <8 x float> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
; NoVLX-NEXT: .Lcfi1844:
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .Lcfi1846:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1847:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1848:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1849:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1850:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1851:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %r9d
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %edx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %esi
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %edi
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %1 = bitcast <8 x i64> %__b to <16 x float>
- %2 = fcmp oeq <16 x float> %0, %1
- %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%4 = bitcast <32 x i1> %3 to i32
ret i32 %4
}
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1852:
+; NoVLX-NEXT: .Lcfi1847:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1853:
+; NoVLX-NEXT: .Lcfi1848:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1854:
+; NoVLX-NEXT: .Lcfi1849:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1855:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1856:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1857:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1858:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1859:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %r9d
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %edx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %load = load <8 x i64>, <8 x i64>* %__b
- %1 = bitcast <8 x i64> %load to <16 x float>
- %2 = fcmp oeq <16 x float> %0, %1
- %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%4 = bitcast <32 x i1> %3 to i32
ret i32 %4
}
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1860:
+; NoVLX-NEXT: .Lcfi1850:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1861:
+; NoVLX-NEXT: .Lcfi1851:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1862:
+; NoVLX-NEXT: .Lcfi1852:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1863:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1864:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1865:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1866:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1867:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %r9d
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %edx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
+ %0 = bitcast <4 x i64> %__a to <8 x float>
%load = load float, float* %__b
- %vec = insertelement <16 x float> undef, float %load, i32 0
- %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <16 x float> %0, %1
- %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%4 = bitcast <32 x i1> %3 to i32
ret i32 %4
}
-
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovw %k0, %eax
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %1 = bitcast <8 x i64> %__b to <16 x float>
- %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
- %3 = zext i16 %2 to i32
- ret i32 %3
-}
-
-
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1868:
+; NoVLX-NEXT: .Lcfi1853:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1869:
+; NoVLX-NEXT: .Lcfi1854:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1870:
+; NoVLX-NEXT: .Lcfi1855:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1871:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1872:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1873:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1874:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1875:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %r9d
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %edx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %1 = bitcast <8 x i64> %__b to <16 x float>
- %2 = fcmp oeq <16 x float> %0, %1
- %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1876:
+; NoVLX-NEXT: .Lcfi1856:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1877:
+; NoVLX-NEXT: .Lcfi1857:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1878:
+; NoVLX-NEXT: .Lcfi1858:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1879:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1880:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1881:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1882:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1883:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rsi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %r9d
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %edx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %load = load <8 x i64>, <8 x i64>* %__b
- %1 = bitcast <8 x i64> %load to <16 x float>
- %2 = fcmp oeq <16 x float> %0, %1
- %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1884:
+; NoVLX-NEXT: .Lcfi1859:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1885:
+; NoVLX-NEXT: .Lcfi1860:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1886:
+; NoVLX-NEXT: .Lcfi1861:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1887:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1888:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1889:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1890:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1891:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %r9d
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %edx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %esi
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %edi
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+
+define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1862:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1863:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1864:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %load = load float, float* %__b
- %vec = insertelement <16 x float> undef, float %load, i32 0
- %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <16 x float> %0, %1
- %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%4 = bitcast <64 x i1> %3 to i64
ret i64 %4
}
-
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: movzwl %ax, %eax
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: movzwl %ax, %eax
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__a to <16 x float>
- %1 = bitcast <8 x i64> %__b to <16 x float>
- %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
- %3 = zext i16 %2 to i64
- ret i64 %3
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1865:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1866:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1867:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1871:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1872:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1873:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1874:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1875:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1876:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rsi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1877:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1878:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1879:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1880:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1881:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1882:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1883:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1884:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1885:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1886:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1887:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1888:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1889:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1890:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1891:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1892:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1893:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1894:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1895:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1896:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1897:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1898:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1899:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1900:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1901:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1902:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1903:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <16 x float> undef, float %load, i32 0
+ %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1904:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1905:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1906:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1907:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1908:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1909:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1910:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1911:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1912:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1913:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1914:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1915:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1916:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1917:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1918:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1919:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1920:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1921:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1922:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1923:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1924:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1925:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1926:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1927:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <16 x float> undef, float %load, i32 0
+ %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovw %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
+ %3 = zext i16 %2 to i32
+ ret i32 %3
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovw %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)
+ %3 = zext i16 %2 to i32
+ ret i32 %3
+}
+
+
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1928:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1929:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1930:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1931:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1932:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1933:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1934:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1935:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1936:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1937:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1938:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1939:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1940:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1941:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1942:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1943:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1944:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1945:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1946:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1947:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1948:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1949:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1950:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1951:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <16 x float> undef, float %load, i32 0
+ %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1952:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1953:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1954:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1955:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1956:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1957:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1958:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1959:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1960:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1961:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1962:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1963:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1964:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1965:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1966:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1967:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1968:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1969:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1970:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1971:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1972:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1973:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1974:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1975:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <16 x float> undef, float %load, i32 0
+ %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzwl %ax, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzwl %ax, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
+ %3 = zext i16 %2 to i64
+ ret i64 %3
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzwl %ax, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzwl %ax, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)
+ %3 = zext i16 %2 to i64
+ ret i64 %3
+}
+
+
+
+declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
+define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+
+
+define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1976:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1977:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1978:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1979:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1980:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1981:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1982:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1983:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1984:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1985:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1986:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1987:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1988:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1989:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1990:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1991:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1992:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1993:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+
+define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1994:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1995:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1996:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1997:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1998:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1999:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2000:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2001:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2002:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2003:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2004:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2005:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2006:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2007:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2008:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2009:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2010:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2011:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+
+define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
}
-
-declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
-define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %1 = bitcast <2 x i64> %__b to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %4 = bitcast <4 x i1> %3 to i4
- ret i4 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
}
-define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load <2 x i64>, <2 x i64>* %__b
- %1 = bitcast <2 x i64> %load to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %4 = bitcast <4 x i1> %3 to i4
- ret i4 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
}
-define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load double, double* %__b
- %vec = insertelement <2 x double> undef, double %load, i32 0
- %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %4 = bitcast <4 x i1> %3 to i4
- ret i4 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
-
-define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kshiftrw $1, %k1, %k1
; NoVLX-NEXT: kshiftlw $1, %k1, %k1
; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %1 = bitcast <2 x i64> %__b to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <8 x i1> %3 to i8
- ret i8 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
-define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+
+
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2012:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2013:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2014:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load <2 x i64>, <2 x i64>* %__b
- %1 = bitcast <2 x i64> %load to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <8 x i1> %3 to i8
- ret i8 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
}
-define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2015:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2016:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2017:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load double, double* %__b
- %vec = insertelement <2 x double> undef, double %load, i32 0
- %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <8 x i1> %3 to i8
- ret i8 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
}
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2018:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2019:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2020:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
-define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2021:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2022:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2023:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %1 = bitcast <2 x i64> %__b to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2024:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2025:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2026:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load <2 x i64>, <2 x i64>* %__b
- %1 = bitcast <2 x i64> %load to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2027:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2028:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2029:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
+ %0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
- %vec = insertelement <2 x double> undef, double %load, i32 0
- %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1892:
+; NoVLX-NEXT: .Lcfi2030:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1893:
+; NoVLX-NEXT: .Lcfi2031:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1894:
+; NoVLX-NEXT: .Lcfi2032:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %1 = bitcast <2 x i64> %__b to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1895:
+; NoVLX-NEXT: .Lcfi2033:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1896:
+; NoVLX-NEXT: .Lcfi2034:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1897:
+; NoVLX-NEXT: .Lcfi2035:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load <2 x i64>, <2 x i64>* %__b
- %1 = bitcast <2 x i64> %load to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1898:
+; NoVLX-NEXT: .Lcfi2036:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1899:
+; NoVLX-NEXT: .Lcfi2037:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1900:
+; NoVLX-NEXT: .Lcfi2038:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
+ %0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
- %vec = insertelement <2 x double> undef, double %load, i32 0
- %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-
-define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1901:
+; NoVLX-NEXT: .Lcfi2039:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1902:
+; NoVLX-NEXT: .Lcfi2040:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1903:
+; NoVLX-NEXT: .Lcfi2041:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %1 = bitcast <2 x i64> %__b to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1904:
+; NoVLX-NEXT: .Lcfi2042:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1905:
+; NoVLX-NEXT: .Lcfi2043:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1906:
+; NoVLX-NEXT: .Lcfi2044:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
- %load = load <2 x i64>, <2 x i64>* %__b
- %1 = bitcast <2 x i64> %load to <2 x double>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1907:
+; NoVLX-NEXT: .Lcfi2045:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1908:
+; NoVLX-NEXT: .Lcfi2046:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1909:
+; NoVLX-NEXT: .Lcfi2047:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm3
+; NoVLX-NEXT: vcmpeqpd %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <2 x i64> %__a to <2 x double>
+ %0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
- %vec = insertelement <2 x double> undef, double %load, i32 0
- %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
- %2 = fcmp oeq <2 x double> %0, %1
- %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %1 = bitcast <4 x i64> %__b to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <8 x i1> %3 to i8
- ret i8 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
}
-define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <8 x i1> %3 to i8
- ret i8 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
}
-define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
+ %0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
- %vec = insertelement <4 x double> undef, double %load, i32 0
- %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <8 x i1> %3 to i8
- ret i8 %4
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
}
-
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %1 = bitcast <4 x i64> %__b to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load double, double* %__b
- %vec = insertelement <4 x double> undef, double %load, i32 0
- %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+ %3 = zext i8 %2 to i16
+ ret i16 %3
}
-
-define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1910:
-; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1911:
-; NoVLX-NEXT: .cfi_offset %rbp, -16
-; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1912:
-; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: movq %rbp, %rsp
-; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %1 = bitcast <4 x i64> %__b to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+ %3 = zext i8 %2 to i16
+ ret i16 %3
}
-define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1913:
+; NoVLX-NEXT: .Lcfi2048:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1914:
+; NoVLX-NEXT: .Lcfi2049:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1915:
+; NoVLX-NEXT: .Lcfi2050:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%4 = bitcast <32 x i1> %3 to i32
ret i32 %4
}
-define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1916:
+; NoVLX-NEXT: .Lcfi2051:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1917:
+; NoVLX-NEXT: .Lcfi2052:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1918:
+; NoVLX-NEXT: .Lcfi2053:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load double, double* %__b
- %vec = insertelement <4 x double> undef, double %load, i32 0
- %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%4 = bitcast <32 x i1> %3 to i32
ret i32 %4
}
-
-define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1919:
+; NoVLX-NEXT: .Lcfi2054:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1920:
+; NoVLX-NEXT: .Lcfi2055:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1921:
+; NoVLX-NEXT: .Lcfi2056:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %1 = bitcast <4 x i64> %__b to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
}
-define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1922:
+; NoVLX-NEXT: .Lcfi2057:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1923:
+; NoVLX-NEXT: .Lcfi2058:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1924:
+; NoVLX-NEXT: .Lcfi2059:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load <4 x i64>, <4 x i64>* %__b
- %1 = bitcast <4 x i64> %load to <4 x double>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1925:
+; NoVLX-NEXT: .Lcfi2060:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1926:
+; NoVLX-NEXT: .Lcfi2061:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1927:
+; NoVLX-NEXT: .Lcfi2062:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <4 x i64> %__a to <4 x double>
- %load = load double, double* %__b
- %vec = insertelement <4 x double> undef, double %load, i32 0
- %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
-}
-
-
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
- %1 = bitcast <8 x i64> %__b to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi2063:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi2064:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2065:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
- %load = load <8 x i64>, <8 x i64>* %__b
- %1 = bitcast <8 x i64> %load to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
}
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
- %load = load double, double* %__b
- %vec = insertelement <8 x double> undef, double %load, i32 0
- %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+ %3 = zext i8 %2 to i32
+ ret i32 %3
}
-
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: movzbl %al, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
- %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
- %3 = zext i8 %2 to i16
- ret i16 %3
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+ %3 = zext i8 %2 to i32
+ ret i32 %3
}
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
; VLX: # BB#0: # %entry
; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1928:
+; NoVLX-NEXT: .Lcfi2066:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1929:
+; NoVLX-NEXT: .Lcfi2067:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1930:
+; NoVLX-NEXT: .Lcfi2068:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1931:
+; NoVLX-NEXT: .Lcfi2069:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1932:
+; NoVLX-NEXT: .Lcfi2070:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1933:
+; NoVLX-NEXT: .Lcfi2071:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
%load = load <8 x i64>, <8 x i64>* %__b
%1 = bitcast <8 x i64> %load to <8 x double>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
; VLX: # BB#0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1934:
+; NoVLX-NEXT: .Lcfi2072:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1935:
+; NoVLX-NEXT: .Lcfi2073:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1936:
+; NoVLX-NEXT: .Lcfi2074:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
%vec = insertelement <8 x double> undef, double %load, i32 0
%1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <32 x i1> %3 to i32
- ret i32 %4
-}
-
-
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovb %k0, %eax
-; VLX-NEXT: vzeroupper
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: movzbl %al, %eax
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__a to <8 x double>
- %1 = bitcast <8 x i64> %__b to <8 x double>
- %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
- %3 = zext i8 %2 to i32
- ret i32 %3
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
}
-
-define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1937:
+; NoVLX-NEXT: .Lcfi2075:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1938:
+; NoVLX-NEXT: .Lcfi2076:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1939:
+; NoVLX-NEXT: .Lcfi2077:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1940:
+; NoVLX-NEXT: .Lcfi2078:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1941:
+; NoVLX-NEXT: .Lcfi2079:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1942:
+; NoVLX-NEXT: .Lcfi2080:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
%load = load <8 x i64>, <8 x i64>* %__b
%1 = bitcast <8 x i64> %load to <8 x double>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
-define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1943:
+; NoVLX-NEXT: .Lcfi2081:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1944:
+; NoVLX-NEXT: .Lcfi2082:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1945:
+; NoVLX-NEXT: .Lcfi2083:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
%vec = insertelement <8 x double> undef, double %load, i32 0
%1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <8 x double> %0, %1
- %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %4 = bitcast <64 x i1> %3 to i64
- ret i64 %4
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
}
+
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
; VLX: # BB#0: # %entry
ret i64 %3
}
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+ %3 = zext i8 %2 to i64
+ ret i64 %3
+}
+