; CHECK-LABEL: bcast_unfold_smin_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB72_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpminsd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB72_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB73_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB73_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB74_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB74_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB76_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB76_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB77_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB77_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB78_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpmaxsd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB78_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB79_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB79_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB80_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpmaxsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB80_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB82_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpmaxsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB82_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_smax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB83_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB83_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umin_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB84_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB84_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB85_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB85_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB86_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpminud {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB86_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB88_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpminuq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB88_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB89_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB89_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB90_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpmaxud {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB90_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB91_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpmaxud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB91_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB92_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpmaxud {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB92_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB94_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpmaxuq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB94_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-LABEL: bcast_unfold_umax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB95_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpmaxuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB95_1
; CHECK-NEXT: # %bb.2: # %bb10