; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
; test_mm512_kunpackd: builds a 64-bit mask from two byte-wise compares and
; uses it to predicate a third byte-wise compare.
;   high 32 bits of the mask = low 32 bits of (B != A)
;   low  32 bits of the mask = low 32 bits of (C != D)
;   return value             = (E != F) & mask, as an i64 bit mask.
; Both check-prefix groups below expect the shift/and/or combine to be
; selected as a single kunpckdq feeding a masked vpcmpneqb.
; NOTE(review): presumably mirrors the _mm512_kunpackd intrinsic (inferred
; from the function name) - confirm against the clang test named in the
; file header.
-define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackd:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
-; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
-; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT: kunpckdq %k0, %k1, %k1
-; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackd:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckdq %k0, %k1, %k1
-; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovq %k0, %rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__B to <64 x i8>
- %1 = bitcast <8 x i64> %__A to <64 x i8>
- %2 = icmp ne <64 x i8> %0, %1 ; per-byte B != A
- %3 = bitcast <64 x i1> %2 to i64 ; 64 compare bits packed into one i64
- %4 = bitcast <8 x i64> %__C to <64 x i8>
- %5 = bitcast <8 x i64> %__D to <64 x i8>
- %6 = icmp ne <64 x i8> %4, %5 ; per-byte C != D
- %7 = bitcast <64 x i1> %6 to i64
- %and.i = and i64 %7, 4294967295 ; keep only the low 32 bits of the C/D mask
- %shl.i = shl i64 %3, 32 ; move the low 32 bits of the B/A mask into the high half
- %or.i = or i64 %and.i, %shl.i ; concatenate the two 32-bit halves
- %8 = bitcast <8 x i64> %__E to <64 x i8>
- %9 = bitcast <8 x i64> %__F to <64 x i8>
- %10 = icmp ne <64 x i8> %8, %9 ; per-byte E != F
- %11 = bitcast i64 %or.i to <64 x i1> ; combined mask back to a per-lane predicate
- %12 = and <64 x i1> %10, %11 ; predicate the E/F compare with the combined mask
- %13 = bitcast <64 x i1> %12 to i64
- ret i64 %13
-}
-
; test_mm512_kunpackw: builds a 32-bit mask from two word-wise compares and
; uses it to predicate a third word-wise compare.
;   high 16 bits of the mask = low 16 bits of (B != A)
;   low  16 bits of the mask = low 16 bits of (C != D)
;   return value             = (E != F) & mask, as an i32 bit mask.
; Both check-prefix groups below expect the shift/and/or combine to be
; selected as a single kunpckwd feeding a masked vpcmpneqw.
; NOTE(review): presumably mirrors the _mm512_kunpackw intrinsic (inferred
; from the function name) - confirm against the clang test named in the
; file header.
-define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackw:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
-; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
-; X32-NEXT: kunpckwd %k0, %k1, %k1
-; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
-; X32-NEXT: kmovd %k0, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackw:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
-; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckwd %k0, %k1, %k1
-; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovd %k0, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__B to <32 x i16>
- %1 = bitcast <8 x i64> %__A to <32 x i16>
- %2 = icmp ne <32 x i16> %0, %1 ; per-word B != A
- %3 = bitcast <32 x i1> %2 to i32 ; 32 compare bits packed into one i32
- %4 = bitcast <8 x i64> %__C to <32 x i16>
- %5 = bitcast <8 x i64> %__D to <32 x i16>
- %6 = icmp ne <32 x i16> %4, %5 ; per-word C != D
- %7 = bitcast <32 x i1> %6 to i32
- %and.i = and i32 %7, 65535 ; keep only the low 16 bits of the C/D mask
- %shl.i = shl i32 %3, 16 ; move the low 16 bits of the B/A mask into the high half
- %or.i = or i32 %and.i, %shl.i ; concatenate the two 16-bit halves
- %8 = bitcast <8 x i64> %__E to <32 x i16>
- %9 = bitcast <8 x i64> %__F to <32 x i16>
- %10 = icmp ne <32 x i16> %8, %9 ; per-word E != F
- %11 = bitcast i32 %or.i to <32 x i1> ; combined mask back to a per-lane predicate
- %12 = and <32 x i1> %10, %11 ; predicate the E/F compare with the combined mask
- %13 = bitcast <32 x i1> %12 to i32
- ret i32 %13
-}
-
-
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X32-LABEL: test_mm512_mask_set1_epi8:
; X32: # %bb.0: # %entry
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $55, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $54, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $53, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
-; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $12, %eax
-; X32-NEXT: andl $15, %eax
-; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: kmovd %eax, %k3
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $14, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: kmovd %eax, %k4
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $15, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $12, %esi
-; X32-NEXT: andl $15, %esi
-; X32-NEXT: kmovd %esi, %k2
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $14, %esi
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: kmovd %esi, %k3
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $15, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k4
-; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: kxorq %k0, %k1, %k3
+; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k4
-; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k3, %k5, %k3
+; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k5
-; X32-NEXT: kshiftrq $54, %k5, %k4
-; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: kxorq %k3, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k3
+; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
-; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $28, %ecx
-; X32-NEXT: kmovd %ecx, %k5
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $30, %ecx
-; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
-; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $55, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $54, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $53, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
-; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $12, %eax
-; X32-NEXT: andl $15, %eax
-; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: kmovd %eax, %k3
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $14, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: kmovd %eax, %k4
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $15, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $12, %esi
-; X32-NEXT: andl $15, %esi
-; X32-NEXT: kmovd %esi, %k2
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $14, %esi
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: kmovd %esi, %k3
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $15, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k4
-; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: kxorq %k0, %k1, %k3
+; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k4
-; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k3, %k5, %k3
+; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k5
-; X32-NEXT: kshiftrq $54, %k5, %k4
-; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: kxorq %k3, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k3
+; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
-; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $28, %ecx
-; X32-NEXT: kmovd %ecx, %k5
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $30, %ecx
-; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
-; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al