""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used.
+when fast-math flags are used. The type of the accumulator matches the
+element-type of the vector input.
The second argument must be a vector of floating-point values.
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used.
+when fast-math flags are used. The type of the accumulator matches the
+element-type of the vector input.
The second argument must be a vector of floating-point values.
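For illustration, a minimal IR sketch of the updated signature (the function and value names below are hypothetical, not taken from the patch): the accumulator has the same scalar type as the vector elements, and it may be undef when fast-math flags are attached.

declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define float @reduce_ordered(float %acc, <4 x float> %v) {
  ; Ordered reduction: the scalar accumulator participates in the result.
  %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %v)
  ret float %r
}

define float @reduce_fast(<4 x float> %v) {
  ; Fast-math reduction: the accumulator is ignored and may be undef.
  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %v)
  ret float %r
}

The fmul reduction takes the same form, with a scalar start value of the element type.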
//===------------------------ Reduction Intrinsics ------------------------===//
//
def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
- [llvm_anyfloat_ty,
+ [LLVMMatchType<0>,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
- [llvm_anyfloat_ty,
+ [LLVMMatchType<0>,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
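Tying the accumulator to the overloaded result type with LLVMMatchType<0> also drops the separate accumulator suffix from the mangled intrinsic names. A sketch of the resulting declarations, based on the test updates later in this patch:

declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)     ; previously @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32
declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)  ; previously @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64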
CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
- Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
- Src->getType()};
+ Type *Tys[] = {Acc->getType(), Src->getType()};
auto Decl = Intrinsic::getDeclaration(
M, Intrinsic::experimental_vector_reduce_fadd, Tys);
return createCallHelper(Decl, Ops, this);
}

CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
- Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
- Src->getType()};
+ Type *Tys[] = {Acc->getType(), Src->getType()};
auto Decl = Intrinsic::getDeclaration(
M, Intrinsic::experimental_vector_reduce_fmul, Tys);
return createCallHelper(Decl, Ops, this);
}
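With the shortened overload list {Acc->getType(), Src->getType()}, these helpers resolve declarations mangled only on the result and vector types. For a double accumulator and a <2 x double> source, CreateFMulReduce would produce IR along these lines (a sketch; the value names are placeholders):

declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)

  %rdx = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %src)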
--- /dev/null
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64
+define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
+ %res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+ ret float %res
+}
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64
+define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) {
+ %res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+ ret double %res
+}
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64
+define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) {
+ %res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
+ ret <2 x double> %res
+}
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64
+define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) {
+ %res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+ ret double %res
+}
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
; CHECK-LABEL: add_HalfS:
; CHECK: faddp s0, v0.2s
; CHECK-NEXT: ret
- %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float> undef, <2 x float> %bin.rdx)
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx)
ret float %r
}
; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
; CHECKNOFP16: ret
- %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half> undef, <4 x half> %bin.rdx)
+ %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx)
ret half %r
}
; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
; CHECKNOFP16: ret
- %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half> undef, <8 x half> %bin.rdx)
+ %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx)
ret half %r
}
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: ret
- %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float> undef, <4 x float> %bin.rdx)
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx)
ret float %r
}
; CHECK-LABEL: add_D:
; CHECK: faddp d0, v0.2d
; CHECK-NEXT: ret
- %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double> undef, <2 x double> %bin.rdx)
+ %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx)
ret double %r
}
; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
; CHECKNOFP16: ret
- %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half> undef, <16 x half> %bin.rdx)
+ %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx)
ret half %r
}
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: ret
- %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float> undef, <8 x float> %bin.rdx)
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx)
ret float %r
}
; CHECK: fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: ret
- %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double> undef, <4 x double> %bin.rdx)
+ %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx)
ret double %r
}
; Function Attrs: nounwind readnone
-declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half>, <4 x half>)
-declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half>, <8 x half>)
-declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half>, <16 x half>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float>, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float>, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float>, <8 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double>, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double>, <4 x double>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
- %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
ret float %r
}
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
- %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
ret double %r
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>)
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>)