From 134df240d73626a8fbd2daabed71fb0a4219a51f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 14 Oct 2017 19:46:08 +0000 Subject: [PATCH] [X86][SSE] Test vector imul reduction on 32 and 64-bit targets git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315824 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/shrink_vmul.ll | 1204 ++++++++++++++++++++++--------- 1 file changed, 846 insertions(+), 358 deletions(-) diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index a7682f2275e..504c8936442 100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 @c = external global i32*, align 8 @@ -10,20 +11,42 @@ ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_2xi8: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movzwl (%edx,%ecx), %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: movzwl (%eax,%ecx), %eax +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-NEXT: movd %ecx, %xmm1 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -48,18 +71,38 @@ entry: ; %rst = mul <4 x i32> %op1, %op2 ; define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_4xi8: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_4xi8: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_4xi8: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -84,21 +127,44 @@ entry: ; %rst = mul <8 x i32> %op1, %op2 ; define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_8xi8: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: pxor %xmm2, %xmm2 
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_8xi8: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_8xi8: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -123,31 +189,64 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_16xi8: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 -; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; CHECK-NEXT: movdqa %xmm1, %xmm4 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; CHECK-NEXT: pmullw %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm4, %xmm3 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_16xi8: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X86-NEXT: movdqa %xmm1, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-NEXT: pmullw %xmm3, %xmm4 +; X86-NEXT: movdqa %xmm4, %xmm3 +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X86-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_16xi8: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-NEXT: pmullw %xmm3, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm3, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -172,17 +271,36 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_2xi16: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pmulhuw %xmm0, %xmm2 -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pmulhuw %xmm0, %xmm2 +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pmulhuw %xmm0, %xmm2 +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -207,17 +325,36 @@ entry: ; %rst = mul <4 x i32> %op1, %op2 ; define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: 
mul_4xi16: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pmulhuw %xmm0, %xmm2 -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_4xi16: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pmulhuw %xmm0, %xmm2 +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_4xi16: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pmulhuw %xmm0, %xmm2 +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -242,20 +379,42 @@ entry: ; %rst = mul <8 x i32> %op1, %op2 ; define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_8xi16: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 -; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pmulhuw %xmm0, %xmm2 -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_8xi16: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pmulhuw %xmm0, %xmm2 +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_8xi16: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pmulhuw %xmm0, %xmm2 +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: 
movdqa %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -280,30 +439,62 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_16xi16: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 -; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2 -; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; CHECK-NEXT: movdqa %xmm2, %xmm4 -; CHECK-NEXT: pmulhuw %xmm0, %xmm4 -; CHECK-NEXT: pmullw %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: pmulhuw %xmm1, %xmm4 -; CHECK-NEXT: pmullw %xmm1, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_16xi16: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-NEXT: movdqa %xmm2, %xmm4 +; X86-NEXT: pmulhuw %xmm0, %xmm4 +; X86-NEXT: pmullw %xmm0, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-NEXT: movdqa %xmm3, %xmm4 +; X86-NEXT: pmulhuw %xmm1, %xmm4 +; X86-NEXT: pmullw %xmm1, %xmm3 +; X86-NEXT: movdqa %xmm3, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_16xi16: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-NEXT: movdqa %xmm2, %xmm4 +; X64-NEXT: pmulhuw %xmm0, %xmm4 +; X64-NEXT: pmullw %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: pmulhuw %xmm1, %xmm4 +; X64-NEXT: pmullw %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -328,22 +519,46 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_2xi8_sext: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-NEXT: psraw $8, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-NEXT: psraw $8, %xmm1 -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_sext: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movzwl (%edx,%ecx), %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: movzwl (%eax,%ecx), %eax +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: psraw $8, %xmm0 +; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: psraw $8, %xmm1 +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: psrad $16, %xmm0 +; X86-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_sext: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-NEXT: movd %ecx, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: psraw $8, %xmm0 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: psraw $8, %xmm1 +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: psrad $16, %xmm0 +; X64-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -368,23 +583,48 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_2xi8_sext_zext: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; 
CHECK-NEXT: movzwl (%rdi,%rdx), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-NEXT: psraw $8, %xmm0 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pmulhw %xmm0, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_sext_zext: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movzwl (%edx,%ecx), %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: movzwl (%eax,%ecx), %eax +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: psraw $8, %xmm0 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pmulhw %xmm0, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_sext_zext: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-NEXT: movd %ecx, %xmm1 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: psraw $8, %xmm0 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pmulhw %xmm0, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -409,17 +649,36 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_2xi16_sext: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pmulhw %xmm0, %xmm2 -; CHECK-NEXT: pmullw %xmm0, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16_sext: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
X86-NEXT: movl c, %esi +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pmulhw %xmm0, %xmm2 +; X86-NEXT: pmullw %xmm0, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16_sext: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pmulhw %xmm0, %xmm2 +; X64-NEXT: pmullw %xmm0, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -444,30 +703,62 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_2xi16_sext_zext: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: psrlq $32, %xmm2 -; CHECK-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: psrlq $32, %xmm3 -; CHECK-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 -; CHECK-NEXT: psllq $32, %xmm3 -; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: paddq %xmm3, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16_sext_zext: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-NEXT: psrad $16, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: psrlq $32, %xmm2 +; X86-NEXT: pmuludq %xmm0, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: psrlq $32, %xmm3 +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: paddq %xmm2, %xmm3 +; X86-NEXT: psllq $32, %xmm3 +; X86-NEXT: pmuludq %xmm0, %xmm1 +; X86-NEXT: paddq %xmm3, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16_sext_zext: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; 
X64-NEXT: psrad $16, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: psrlq $32, %xmm2 +; X64-NEXT: pmuludq %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psrlq $32, %xmm3 +; X64-NEXT: pmuludq %xmm1, %xmm3 +; X64-NEXT: paddq %xmm2, %xmm3 +; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: pmuludq %xmm0, %xmm1 +; X64-NEXT: paddq %xmm3, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -492,30 +783,62 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; CHECK-LABEL: mul_16xi16_sext: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0 -; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2 -; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; CHECK-NEXT: movdqa %xmm2, %xmm4 -; CHECK-NEXT: pmulhw %xmm0, %xmm4 -; CHECK-NEXT: pmullw %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: pmulhw %xmm1, %xmm4 -; CHECK-NEXT: pmullw %xmm1, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_16xi16_sext: +; X86: # BB#0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl c, %esi +; X86-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-NEXT: movdqa %xmm2, %xmm4 +; X86-NEXT: pmulhw %xmm0, %xmm4 +; X86-NEXT: pmullw %xmm0, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-NEXT: movdqa %xmm3, %xmm4 +; X86-NEXT: pmulhw %xmm1, %xmm4 +; X86-NEXT: pmullw %xmm1, %xmm3 +; X86-NEXT: movdqa %xmm3, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: mul_16xi16_sext: +; X64: # 
BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-NEXT: movdqa %xmm2, %xmm4 +; X64-NEXT: pmulhw %xmm0, %xmm4 +; X64-NEXT: pmullw %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: pmulhw %xmm1, %xmm4 +; X64-NEXT: pmullw %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -539,17 +862,31 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi8_varconst1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_varconst1: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movzwl (%ecx,%eax), %ecx +; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_varconst1: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -569,18 +906,33 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi8_varconst2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-NEXT: psraw $8, %xmm0 -; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_varconst2: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movzwl (%ecx,%eax), %ecx +; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: psraw $8, %xmm0 +; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-NEXT: psrad $16, %xmm0 +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_varconst2: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: psraw $8, %xmm0 +; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-NEXT: psrad $16, %xmm0 +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -600,20 +952,37 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi8_varconst3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmulhw %xmm1, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_varconst3: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movzwl (%ecx,%eax), %ecx +; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmulhw %xmm1, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_varconst3: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmulhw %xmm1, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -633,20 +1002,37 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi8_varconst4: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmulhw %xmm1, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_varconst4: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movzwl (%ecx,%eax), %ecx +; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmulhw %xmm1, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_varconst4: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmulhw %xmm1, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -666,20 +1052,37 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi8_varconst5: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-NEXT: psraw $8, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmulhw %xmm1, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_varconst5: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movzwl (%ecx,%eax), %ecx +; X86-NEXT: 
movd %ecx, %xmm0 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: psraw $8, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmulhw %xmm1, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_varconst5: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: psraw $8, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmulhw %xmm1, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -699,20 +1102,37 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi8_varconst6: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-NEXT: psraw $8, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmulhw %xmm1, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi8_varconst6: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movzwl (%ecx,%eax), %ecx +; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: psraw $8, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmulhw %xmm1, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi8_varconst6: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: psraw $8, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmulhw %xmm1, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -732,17 +1152,31 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi16_varconst1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movdqa {{.*#+}} xmm1 
= <0,65535,u,u,u,u,u,u> -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmulhuw %xmm1, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16_varconst1: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmulhuw %xmm1, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16_varconst1: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmulhuw %xmm1, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -762,17 +1196,31 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi16_varconst2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmulhw %xmm1, %xmm2 -; CHECK-NEXT: pmullw %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16_varconst2: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmulhw %xmm1, %xmm2 +; X86-NEXT: pmullw %xmm1, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16_varconst2: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmulhw %xmm1, %xmm2 +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -792,25 +1240,45 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi16_varconst3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000 -; CHECK-NEXT: movq %rcx, %xmm1 -; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-NEXT: psllq $32, %xmm0 -; CHECK-NEXT: paddq %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16_varconst3: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: psrlq $32, %xmm0 +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: psllq $32, %xmm0 +; X86-NEXT: paddq %xmm2, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16_varconst3: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-NEXT: movq %rcx, %xmm1 +; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmuludq %xmm1, %xmm2 +; X64-NEXT: psrlq $32, %xmm0 +; X64-NEXT: pmuludq %xmm1, %xmm0 +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: paddq %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -830,25 +1298,45 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { -; CHECK-LABEL: mul_2xi16_varconst4: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 -; CHECK-NEXT: movq %rcx, %xmm1 -; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-NEXT: psllq $32, %xmm0 -; CHECK-NEXT: paddq %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4) -; CHECK-NEXT: retq +; X86-LABEL: mul_2xi16_varconst4: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl c, %edx +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-NEXT: psrad $16, %xmm0 +; X86-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: psrlq $32, %xmm0 +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: psllq $32, %xmm0 +; X86-NEXT: paddq %xmm2, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-NEXT: retl +; +; X64-LABEL: mul_2xi16_varconst4: +; X64: # BB#0: # %entry +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-NEXT: psrad $16, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-NEXT: movq %rcx, %xmm1 +; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmuludq %xmm1, %xmm2 +; X64-NEXT: psrlq $32, %xmm0 +; X64-NEXT: pmuludq %xmm1, %xmm0 +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: paddq %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index -- 2.40.0
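
For reference, the IR pattern these checks are generated from has the shape
sketched below — a minimal reconstruction of @mul_2xi8 from its comment block
("%rst = mul <2 x i32> %op1, %op2") and the surviving context lines. The diff
elides the full function bodies, so the bitcasts and the %tmp/%val/%op value
names beyond the shown "%pre"/"%tmp6" lines are illustrative assumptions, not
copied from the test:

; Sketch only: two <2 x i8> loads are zero-extended to <2 x i32>, multiplied,
; and the <2 x i32> result is stored through the pointer loaded from @c.
define void @mul_2xi8_sketch(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*          ; assumed cast to the vector load type
  %val1 = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %op1 = zext <2 x i8> %val1 to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %val2 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %op2 = zext <2 x i8> %val2 to <2 x i32>
  %rst = mul <2 x i32> %op1, %op2
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %rst, <2 x i32>* %tmp15, align 4
  ret void
}

The sext, sext_zext, and varconst variants differ only in the extension kinds
and in replacing one operand with a constant vector. Per the NOTE at the top of
the file, re-running it through utils/update_llc_test_checks.py regenerates the
X86 and X64 check blocks for both RUN lines after any codegen change.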