ret <16 x i8> %res1
}
+; TODO - we could fold the load if we lowered to pshuflw instead.
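+; One possible folded-load lowering (a sketch, not current codegen; the $0x80 immediate
+; is an assumption that maps word1<-word0 and word3<-word2, with the other words don't-care)
+; would be a single "pshuflw $0x80, (%rdi), %xmm0", replacing the movdqa + pslld pair checked below.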
+define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
+; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res0 = load <16 x i8>, <16 x i8> *%a0, align 16
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %res1
+}
+
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # BB#0: