--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
+
+; FIXME: We would still like to vectorize the memory operations for VI.
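+; GFX9 has packed 16-bit instructions (e.g. v_pk_mul_f16 and v_pk_fma_f16),
+; so both the f16 arithmetic and the adjacent 2-byte loads/stores are
+; vectorized there. VI only has scalar f16 arithmetic, so the SLP vectorizer
+; currently leaves the whole chain scalar, even though combining the adjacent
+; loads and stores would still be profitable.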
+
+; Simple 3-pair chain with loads and stores
+; GCN-LABEL: @test1_as_3_3_3_v2f16(
+; GFX9: load <2 x half>, <2 x half> addrspace(3)*
+; GFX9: load <2 x half>, <2 x half> addrspace(3)*
+; GFX9: fmul <2 x half>
+; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX9: ret
+
+; VI: load half
+; VI: load half
+define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %mul = fmul half %i0, %i1
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %mul5 = fmul half %i3, %i4
+ store half %mul, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %mul5, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_as_3_0_0_v2f16(
+; GFX9: load <2 x half>, <2 x half> addrspace(3)*
+; GFX9: load <2 x half>, <2 x half>*
+; GFX9: fmul <2 x half>
+; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
+; GFX9: ret
+
+; VI: load half
+; VI: load half
+define amdgpu_kernel void @test1_as_3_0_0_v2f16(half addrspace(3)* %a, half* %b, half* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half* %b, align 2
+ %mul = fmul half %i0, %i1
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
+ %i4 = load half, half* %arrayidx4, align 2
+ %mul5 = fmul half %i3, %i4
+ store half %mul, half* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
+ store half %mul5, half* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_as_0_0_3_v2f16(
+; GFX9: load <2 x half>, <2 x half>*
+; GFX9: load <2 x half>, <2 x half>*
+; GFX9: fmul <2 x half>
+; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX9: ret
+
+; VI: load half
+; VI: load half
+define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
+ %i0 = load half, half* %a, align 2
+ %i1 = load half, half* %b, align 2
+ %mul = fmul half %i0, %i1
+ %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
+ %i3 = load half, half* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
+ %i4 = load half, half* %arrayidx4, align 2
+ %mul5 = fmul half %i3, %i4
+ store half %mul, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %mul5, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_fma_v2f16(
+; GFX9: load <2 x half>
+; GFX9: load <2 x half>
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %i2 = load half, half addrspace(3)* %c, align 2
+ %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+ %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
+ store half %fma0, half addrspace(3)* %d, align 2
+ %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+ store half %fma1, half addrspace(3)* %arrayidx6, align 2
+ ret void
+}
+
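+; The uniform scalar multiplier is splatted into a <2 x half> vector (via
+; insertelement), so a single vector load, multiply, and store suffice.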
+; GCN-LABEL: @mul_scalar_v2f16(
+; GFX9: load <2 x half>
+; GFX9: fmul <2 x half>
+; GFX9: store <2 x half>
+define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %mul = fmul half %i0, %scalar
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %mul5 = fmul half %i3, %scalar
+ store half %mul, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %mul5, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @fabs_v2f16(
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fabs.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %fabs0 = call half @llvm.fabs.f16(half %i0)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %fabs1 = call half @llvm.fabs.f16(half %i3)
+ store half %fabs0, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %fabs1, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_fabs_fma_v2f16(
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fabs.v2f16(
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %i2 = load half, half addrspace(3)* %c, align 2
+ %i0.fabs = call half @llvm.fabs.f16(half %i0)
+
+ %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+ %i3.fabs = call half @llvm.fabs.f16(half %i3)
+
+ %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
+ store half %fma0, half addrspace(3)* %d, align 2
+ %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+ store half %fma1, half addrspace(3)* %arrayidx6, align 2
+ ret void
+}
+
+; FIXME: Should do vector load and extract component for fabs
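+; Only the first element loaded from %b feeds the scalar fabs, so the two
+; loads from %b stay scalar while %a and %c are vectorized. A possible
+; vectorized form (a sketch only; value names are illustrative and this is
+; not the current output) would load <2 x half> from %b and fix up lane 0:
+;   %vb = load <2 x half>, <2 x half> addrspace(3)* %b.cast, align 2
+;   %b0 = extractelement <2 x half> %vb, i32 0
+;   %b0.fabs = call half @llvm.fabs.f16(half %b0)
+;   %vb.fabs = insertelement <2 x half> %vb, half %b0.fabs, i32 0
+;   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %va, <2 x half> %vb.fabs, <2 x half> %vc)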
+; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
+; GFX9: load half
+; GFX9: call half @llvm.fabs.f16(
+; GFX9: load <2 x half>
+; GFX9: load half
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %i2 = load half, half addrspace(3)* %c, align 2
+ %i1.fabs = call half @llvm.fabs.f16(half %i1)
+
+ %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+ %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
+ store half %fma0, half addrspace(3)* %d, align 2
+ %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+ store half %fma1, half addrspace(3)* %arrayidx6, align 2
+ ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.fma.f16(half, half, half) #1
+
+attributes #1 = { nounwind readnone }
+++ /dev/null
-; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
-; XFAIL: *
-;
-; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
-; has vector registers of the expected width.
-; Currently, it says there are 8 vector registers that are 32-bits wide.
-
-target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
-
-
-; Simple 3-pair chain with loads and stores
-define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
-; CHECK-LABEL: @test1_as_3_3_3(
-; CHECK: load <2 x double>, <2 x double> addrspace(3)*
-; CHECK: load <2 x double>, <2 x double> addrspace(3)*
-; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
-; CHECK: ret
- %i0 = load double, double addrspace(3)* %a, align 8
- %i1 = load double, double addrspace(3)* %b, align 8
- %mul = fmul double %i0, %i1
- %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1
- %i3 = load double, double addrspace(3)* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double addrspace(3)* %b, i64 1
- %i4 = load double, double addrspace(3)* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- store double %mul, double addrspace(3)* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1
- store double %mul5, double addrspace(3)* %arrayidx5, align 8
- ret void
-}
-
-define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
-; CHECK-LABEL: @test1_as_3_0_0(
-; CHECK: load <2 x double>, <2 x double> addrspace(3)*
-; CHECK: load <2 x double>, <2 x double>*
-; CHECK: store <2 x double> %{{.*}}, <2 x double>* %
-; CHECK: ret
- %i0 = load double, double addrspace(3)* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1
- %i3 = load double, double addrspace(3)* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- store double %mul, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %mul5, double* %arrayidx5, align 8
- ret void
-}
-
-define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
-; CHECK-LABEL: @test1_as_0_0_3(
-; CHECK: load <2 x double>, <2 x double>*
-; CHECK: load <2 x double>, <2 x double>*
-; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
-; CHECK: ret
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- store double %mul, double addrspace(3)* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1
- store double %mul5, double addrspace(3)* %arrayidx5, align 8
- ret void
-}