%4 = bitcast <16 x i8> %3 to <8 x i16>
ret <8 x i16> %4
}
+
+define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_pshufb_insertqi_pshufb:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u]
+; SSE-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,1,2,4,3,u,u,0,u,u,u,u,u,u,u,u]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_insertqi_pshufb:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u]
+; AVX-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,1,2,4,3,u,u,0,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 17, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 1, i8 2, i8 4, i8 3, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %3
+}