;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
-declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
-declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata)
declare float @llvm.sqrt.f32(float)
declare void @llvm.s390.sfpc(i32)
-; For non-strict operations, we expect the post-RA scheduler to
-; separate the two square root instructions on z13.
-define void @f1(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
+; The basic assumption of all following tests is that on z13, we never
+; want to see two square root instructions directly in a row, so the
+; post-RA scheduler will always schedule something else in between
+; whenever possible.
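+; Each test below therefore computes two square roots (sqebr) and
+; stores both results (ste); the CHECK lines verify whether the
+; scheduler was able to move a store in between the two square roots.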
+
+; We can move any FP operation across a (normal) store.
+
+define void @f1(float %f1, float %f2, float *%ptr1, float *%ptr2) {
; CHECK-LABEL: f1:
; CHECK: sqebr
-; CHECK: {{aebr|sebr}}
+; CHECK: ste
; CHECK: sqebr
+; CHECK: ste
; CHECK: br %r14
- %add = fadd float %f1, %f2
- %sub = fsub float %f3, %f4
- %sqrt1 = call float @llvm.sqrt.f32(float %f2)
- %sqrt2 = call float @llvm.sqrt.f32(float %f4)
-
- %ptr1 = getelementptr float, float *%ptr0, i64 1
- %ptr2 = getelementptr float, float *%ptr0, i64 2
- %ptr3 = getelementptr float, float *%ptr0, i64 3
+ %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+ %sqrt2 = call float @llvm.sqrt.f32(float %f2)
- store float %add, float *%ptr0
- store float %sub, float *%ptr1
- store float %sqrt1, float *%ptr2
- store float %sqrt2, float *%ptr3
+ store float %sqrt1, float *%ptr1
+ store float %sqrt2, float *%ptr2
ret void
}
-; But for strict operations, this must not happen.
-define void @f2(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
+define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) {
; CHECK-LABEL: f2:
-; CHECK: {{aebr|sebr}}
-; CHECK: {{aebr|sebr}}
; CHECK: sqebr
+; CHECK: ste
; CHECK: sqebr
+; CHECK: ste
; CHECK: br %r14
- %add = call float @llvm.experimental.constrained.fadd.f32(
- float %f1, float %f2,
+ %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f1,
metadata !"round.dynamic",
- metadata !"fpexcept.strict")
- %sub = call float @llvm.experimental.constrained.fsub.f32(
- float %f3, float %f4,
+ metadata !"fpexcept.ignore")
+ %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f2,
metadata !"round.dynamic",
- metadata !"fpexcept.strict")
+ metadata !"fpexcept.ignore")
+
+ store float %sqrt1, float *%ptr1
+ store float %sqrt2, float *%ptr2
+
+ ret void
+}
+
+define void @f3(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f3:
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: br %r14
+
%sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
- float %f2,
+ float %f1,
metadata !"round.dynamic",
metadata !"fpexcept.strict")
%sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
- float %f4,
+ float %f2,
metadata !"round.dynamic",
metadata !"fpexcept.strict")
- %ptr1 = getelementptr float, float *%ptr0, i64 1
- %ptr2 = getelementptr float, float *%ptr0, i64 2
- %ptr3 = getelementptr float, float *%ptr0, i64 3
+ store float %sqrt1, float *%ptr1
+ store float %sqrt2, float *%ptr2
- store float %add, float *%ptr0
- store float %sub, float *%ptr1
- store float %sqrt1, float *%ptr2
- store float %sqrt2, float *%ptr3
+ ret void
+}
+
+
+; We can move a non-strict FP operation or an fpexcept.ignore
+; operation even across a volatile store, but not an fpexcept.strict
+; operation.
+
+define void @f4(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f4:
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: br %r14
+
+ %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+ %sqrt2 = call float @llvm.sqrt.f32(float %f2)
+
+ store volatile float %sqrt1, float *%ptr1
+ store volatile float %sqrt2, float *%ptr2
ret void
}
-; On the other hand, strict operations that use the fpexcept.ignore
-; exception behaviour should be scheduled freely.
-define void @f3(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
-; CHECK-LABEL: f3:
+define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f5:
; CHECK: sqebr
-; CHECK: {{aebr|sebr}}
+; CHECK: ste
; CHECK: sqebr
+; CHECK: ste
; CHECK: br %r14
- %add = call float @llvm.experimental.constrained.fadd.f32(
- float %f1, float %f2,
+ %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f1,
metadata !"round.dynamic",
metadata !"fpexcept.ignore")
- %sub = call float @llvm.experimental.constrained.fsub.f32(
- float %f3, float %f4,
+ %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f2,
metadata !"round.dynamic",
metadata !"fpexcept.ignore")
+
+ store volatile float %sqrt1, float *%ptr1
+ store volatile float %sqrt2, float *%ptr2
+
+ ret void
+}
+
+define void @f6(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f6:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
%sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
float %f2,
metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+
+ store volatile float %sqrt1, float *%ptr1
+ store volatile float %sqrt2, float *%ptr2
+
+ ret void
+}
+
+
+; No variant of FP operation can be scheduled across an SFPC, since
+; the llvm.s390.sfpc intrinsic sets the floating-point control register
+; and may therefore change the rounding mode and exception behavior.
+
+define void @f7(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f7:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
+ %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+ %sqrt2 = call float @llvm.sqrt.f32(float %f2)
+
+ call void @llvm.s390.sfpc(i32 0)
+
+ store float %sqrt1, float *%ptr1
+ store float %sqrt2, float *%ptr2
+
+ ret void
+}
+
+define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f8:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
+ %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f1,
+ metadata !"round.dynamic",
metadata !"fpexcept.ignore")
%sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
- float %f4,
+ float %f2,
metadata !"round.dynamic",
metadata !"fpexcept.ignore")
- %ptr1 = getelementptr float, float *%ptr0, i64 1
- %ptr2 = getelementptr float, float *%ptr0, i64 2
- %ptr3 = getelementptr float, float *%ptr0, i64 3
+ call void @llvm.s390.sfpc(i32 0)
- store float %add, float *%ptr0
- store float %sub, float *%ptr1
- store float %sqrt1, float *%ptr2
- store float %sqrt2, float *%ptr3
+ store float %sqrt1, float *%ptr1
+ store float %sqrt2, float *%ptr2
ret void
}
-; However, even non-strict operations must not be scheduled across an SFPC.
-define void @f4(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
-; CHECK-LABEL: f4:
-; CHECK: {{aebr|sebr}}
-; CHECK: {{aebr|sebr}}
-; CHECK: sfpc
+define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f9:
; CHECK: sqebr
; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
; CHECK: br %r14
- %add = fadd float %f1, %f2
- %sub = fsub float %f3, %f4
- call void @llvm.s390.sfpc(i32 0)
- %sqrt1 = call float @llvm.sqrt.f32(float %f2)
- %sqrt2 = call float @llvm.sqrt.f32(float %f4)
+ %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+ float %f2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
- %ptr1 = getelementptr float, float *%ptr0, i64 1
- %ptr2 = getelementptr float, float *%ptr0, i64 2
- %ptr3 = getelementptr float, float *%ptr0, i64 3
+ call void @llvm.s390.sfpc(i32 0)
- store float %add, float *%ptr0
- store float %sub, float *%ptr1
- store float %sqrt1, float *%ptr2
- store float %sqrt2, float *%ptr3
+ store float %sqrt1, float *%ptr1
+ store float %sqrt2, float *%ptr2
ret void
}

; S390X-NEXT: ldeb %f3, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI3_2
; S390X-NEXT: ldeb %f4, 0(%r1)
-; S390X-NEXT: ddb %f2, 0(%r2)
; S390X-NEXT: ddbr %f3, %f1
+; S390X-NEXT: ddb %f2, 0(%r2)
; S390X-NEXT: ddbr %f4, %f0
; S390X-NEXT: std %f4, 16(%r2)
; S390X-NEXT: std %f3, 8(%r2)
define void @constrained_vector_fmul_v3f64(<3 x double>* %a) {
; S390X-LABEL: constrained_vector_fmul_v3f64:
; S390X: # %bb.0: # %entry
+; S390X-NEXT: ld %f0, 8(%r2)
; S390X-NEXT: larl %r1, .LCPI13_0
-; S390X-NEXT: ld %f0, 0(%r1)
-; S390X-NEXT: ld %f1, 8(%r2)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ld %f2, 16(%r2)
-; S390X-NEXT: ldr %f3, %f0
+; S390X-NEXT: mdbr %f0, %f1
+; S390X-NEXT: ldr %f3, %f1
; S390X-NEXT: mdb %f3, 0(%r2)
-; S390X-NEXT: mdbr %f1, %f0
-; S390X-NEXT: mdbr %f2, %f0
+; S390X-NEXT: mdbr %f2, %f1
; S390X-NEXT: std %f2, 16(%r2)
-; S390X-NEXT: std %f1, 8(%r2)
+; S390X-NEXT: std %f0, 8(%r2)
; S390X-NEXT: std %f3, 0(%r2)
; S390X-NEXT: br %r14
;
define void @constrained_vector_fadd_v3f64(<3 x double>* %a) {
; S390X-LABEL: constrained_vector_fadd_v3f64:
; S390X: # %bb.0: # %entry
+; S390X-NEXT: ld %f0, 8(%r2)
; S390X-NEXT: larl %r1, .LCPI18_0
-; S390X-NEXT: ld %f0, 0(%r1)
-; S390X-NEXT: ld %f1, 8(%r2)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ld %f2, 16(%r2)
-; S390X-NEXT: ldr %f3, %f0
+; S390X-NEXT: adbr %f0, %f1
+; S390X-NEXT: ldr %f3, %f1
; S390X-NEXT: adb %f3, 0(%r2)
-; S390X-NEXT: adbr %f1, %f0
-; S390X-NEXT: adbr %f2, %f0
+; S390X-NEXT: adbr %f2, %f1
; S390X-NEXT: std %f2, 16(%r2)
-; S390X-NEXT: std %f1, 8(%r2)
+; S390X-NEXT: std %f0, 8(%r2)
; S390X-NEXT: std %f3, 0(%r2)
; S390X-NEXT: br %r14
;
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI22_0
; S390X-NEXT: le %f0, 0(%r1)
-; S390X-NEXT: lzer %f1
; S390X-NEXT: ler %f4, %f0
-; S390X-NEXT: sebr %f4, %f1
; S390X-NEXT: larl %r1, .LCPI22_1
; S390X-NEXT: ler %f2, %f0
; S390X-NEXT: seb %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI22_2
; S390X-NEXT: seb %f0, 0(%r1)
+; S390X-NEXT: lzer %f1
+; S390X-NEXT: sebr %f4, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fsub_v3f32: