From: Roman Lebedev Date: Sat, 20 Jul 2019 19:25:44 +0000 (+0000) Subject: [NFC][Codegen][X86][AArch64] Add "(x s% C) == 0" tests X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cc1879b272cd6ee6d906a8411b599c0efc013ceb;p=llvm [NFC][Codegen][X86][AArch64] Add "(x s% C) == 0" tests Much like with `urem`, the same optimization (albeit with slightly different algorithm) applies for the signed case, too. I'm simply copying the test coverage from `urem` case for now, i believe it should be (close to?) sufficient. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366640 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/AArch64/srem-seteq-optsize.ll b/test/CodeGen/AArch64/srem-seteq-optsize.ll new file mode 100644 index 00000000000..4d8d46a1814 --- /dev/null +++ b/test/CodeGen/AArch64/srem-seteq-optsize.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { +; CHECK-LABEL: test_minsize: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: sdiv w8, w0, w8 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: mov w9, #-10 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: csel w0, w8, w9, eq +; CHECK-NEXT: ret + %rem = srem i32 %X, 5 + %cmp = icmp eq i32 %rem, 0 + %ret = select i1 %cmp, i32 42, i32 -10 + ret i32 %ret +} + +define i32 @test_optsize(i32 %X) optsize nounwind readnone { +; CHECK-LABEL: test_optsize: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #26215 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x10, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w10 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: mov w9, #-10 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: csel w0, w8, w9, eq +; CHECK-NEXT: ret + %rem = srem i32 %X, 5 + %cmp = icmp eq i32 %rem, 0 + %ret = select i1 %cmp, i32 42, i32 -10 + ret i32 %ret +} diff --git a/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll new file mode 100644 index 00000000000..6a9a9225183 --- /dev/null +++ b/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -0,0 +1,802 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; At the moment, BuildSREMEqFold does not handle nonsplat vectors. + +; Odd+Even divisors +define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;==============================================================================; + +; One all-ones divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: adrp x8, .LCPI1_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: adrp x8, .LCPI1_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] +; CHECK-NEXT: adrp x8, .LCPI1_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: adrp x8, .LCPI1_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] +; CHECK-NEXT: adrp x8, .LCPI2_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_3] +; CHECK-NEXT: adrp x8, .LCPI2_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor in even divisor +define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_3] +; CHECK-NEXT: adrp x8, .LCPI3_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: adrp x8, .LCPI4_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_allones_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: adrp x8, .LCPI5_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_allones_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] +; CHECK-NEXT: adrp x8, .LCPI6_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] +; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: adrp x8, .LCPI6_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One power-of-two divisor in odd divisor +define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: adrp x8, .LCPI7_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] +; CHECK-NEXT: adrp x8, .LCPI7_2 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] +; CHECK-NEXT: adrp x8, .LCPI7_3 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor in even divisor +define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: adrp x8, .LCPI8_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sshr v3.4s, v1.4s, #3 +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: adrp x8, .LCPI9_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] +; CHECK-NEXT: adrp x8, .LCPI9_2 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One one divisor in odd divisor +define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: adrp x8, .LCPI10_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_1] +; CHECK-NEXT: adrp x8, .LCPI10_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_2] +; CHECK-NEXT: adrp x8, .LCPI10_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_3] +; CHECK-NEXT: adrp x8, .LCPI10_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One one divisor in even divisor +define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: adrp x8, .LCPI11_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] +; CHECK-NEXT: adrp x8, .LCPI11_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: adrp x8, .LCPI11_3 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: adrp x8, .LCPI12_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] +; CHECK-NEXT: adrp x8, .LCPI12_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] +; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: adrp x8, .LCPI12_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;==============================================================================; + +; One all-ones divisor and power-of-two divisor divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: adrp x8, .LCPI13_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] +; CHECK-NEXT: adrp x8, .LCPI13_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2] +; CHECK-NEXT: adrp x8, .LCPI13_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_3] +; CHECK-NEXT: adrp x8, .LCPI13_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and power-of-two divisor divisor in even divisor +define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_and_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: adrp x8, .LCPI14_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] +; CHECK-NEXT: adrp x8, .LCPI14_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2] +; CHECK-NEXT: adrp x8, .LCPI14_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_3] +; CHECK-NEXT: adrp x8, .LCPI14_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and power-of-two divisor divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: adrp x8, .LCPI15_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] +; CHECK-NEXT: adrp x8, .LCPI15_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] +; CHECK-NEXT: adrp x8, .LCPI15_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_3] +; CHECK-NEXT: adrp x8, .LCPI15_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One all-ones divisor and one one divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: adrp x8, .LCPI16_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] +; CHECK-NEXT: adrp x8, .LCPI16_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] +; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: adrp x8, .LCPI16_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in even divisor +define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: adrp x8, .LCPI17_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] +; CHECK-NEXT: adrp x8, .LCPI17_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] +; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: adrp x8, .LCPI17_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_allones_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: adrp x8, .LCPI18_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-NEXT: adrp x8, .LCPI18_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: adrp x8, .LCPI18_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One power-of-two divisor divisor and one divisor in odd divisor +define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: adrp x8, .LCPI19_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1] +; CHECK-NEXT: adrp x8, .LCPI19_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_2] +; CHECK-NEXT: adrp x8, .LCPI19_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3] +; CHECK-NEXT: adrp x8, .LCPI19_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in even divisor +define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: adrp x8, .LCPI20_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] +; CHECK-NEXT: adrp x8, .LCPI20_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: adrp x8, .LCPI20_3 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: adrp x8, .LCPI21_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] +; CHECK-NEXT: adrp x8, .LCPI21_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] +; CHECK-NEXT: adrp x8, .LCPI21_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: adrp x8, .LCPI21_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: adrp x8, .LCPI22_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] +; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: neg v4.4s, v4.4s +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff +; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: adrp x8, .LCPI23_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] +; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3] +; CHECK-NEXT: neg v4.4s, v4.4s +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff +; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} diff --git a/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/test/CodeGen/AArch64/srem-seteq-vec-splat.ll new file mode 100644 index 00000000000..da260fd95b5 --- /dev/null +++ b/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; Odd divisor +define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_25: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: sshr v3.4s, v2.4s, #3 +; CHECK-NEXT: movi v1.4s, #25 +; CHECK-NEXT: usra v3.4s, v2.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Even divisors +define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_100: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: sshr v3.4s, v2.4s, #5 +; CHECK-NEXT: movi v1.4s, #100 +; CHECK-NEXT: usra v3.4s, v2.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; +; Comparison constant has undef elements. +;------------------------------------------------------------------------------; + +define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_undef1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: sshr v3.4s, v2.4s, #3 +; CHECK-NEXT: movi v1.4s, #25 +; CHECK-NEXT: usra v3.4s, v2.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_undef1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: sshr v3.4s, v2.4s, #5 +; CHECK-NEXT: movi v1.4s, #100 +; CHECK-NEXT: usra v3.4s, v2.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; We can lower remainder of division by powers of two much better elsewhere. +define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_pow2: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: usra v2.4s, v1.4s, #28 +; CHECK-NEXT: bic v2.4s, #15 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; We could lower remainder of division by all-ones much better elsewhere. +define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_allones: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; If all divisors are ones, this is constant-folded. +define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_one_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_one_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} diff --git a/test/CodeGen/AArch64/srem-seteq.ll b/test/CodeGen/AArch64/srem-seteq.ll new file mode 100644 index 00000000000..f9acec99c83 --- /dev/null +++ b/test/CodeGen/AArch64/srem-seteq.ll @@ -0,0 +1,253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +;------------------------------------------------------------------------------; +; Odd divisors +;------------------------------------------------------------------------------; + +define i32 @test_srem_odd(i32 %X) nounwind { +; CHECK-LABEL: test_srem_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #26215 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 5 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +define i32 @test_srem_odd_25(i32 %X) nounwind { +; CHECK-LABEL: test_srem_odd_25: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #35 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #25 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 25 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_odd, except the divisor has bit 30 set. +define i32 @test_srem_odd_bit30(i32 %X) nounwind { +; CHECK-LABEL: test_srem_odd_bit30: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: sbfiz x9, x0, #29, #32 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #59 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #3 +; CHECK-NEXT: movk w9, #16384, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 1073741827 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_odd, except the divisor has bit 31 set. +define i32 @test_srem_odd_bit31(i32 %X) nounwind { +; CHECK-LABEL: test_srem_odd_bit31: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: add x8, x8, x8, lsl #29 +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #60 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #-2147483645 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 2147483651 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Even divisors +;------------------------------------------------------------------------------; + +define i16 @test_srem_even(i16 %X) nounwind { +; CHECK-LABEL: test_srem_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #9363 +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: movk w9, #37449, lsl #16 +; CHECK-NEXT: smull x9, w8, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w9, w9, w8 +; CHECK-NEXT: asr w10, w9, #3 +; CHECK-NEXT: add w9, w10, w9, lsr #31 +; CHECK-NEXT: mov w10, #14 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %srem = srem i16 %X, 14 + %cmp = icmp ne i16 %srem, 0 + %ret = zext i1 %cmp to i16 + ret i16 %ret +} + +define i32 @test_srem_even_100(i32 %X) nounwind { +; CHECK-LABEL: test_srem_even_100: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #37 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #100 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 100 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_even, except the divisor has bit 30 set. +define i32 @test_srem_even_bit30(i32 %X) nounwind { +; CHECK-LABEL: test_srem_even_bit30: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #65433 +; CHECK-NEXT: movk w8, #16383, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #60 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #104 +; CHECK-NEXT: movk w9, #16384, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 1073741928 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_odd, except the divisor has bit 31 set. +define i32 @test_srem_even_bit31(i32 %X) nounwind { +; CHECK-LABEL: test_srem_even_bit31: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #65433 +; CHECK-NEXT: movk w8, #32767, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #30 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #102 +; CHECK-NEXT: movk w9, #32768, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 2147483750 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Special case +;------------------------------------------------------------------------------; + +; 'NE' predicate is fine too. +define i32 @test_srem_odd_setne(i32 %X) nounwind { +; CHECK-LABEL: test_srem_odd_setne: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #26215 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %srem = srem i32 %X, 5 + %cmp = icmp ne i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; The fold is invalid if divisor is 1. +define i32 @test_srem_one(i32 %X) nounwind { +; CHECK-LABEL: test_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret + %srem = srem i32 %X, 1 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by all-ones much better elsewhere. +define i32 @test_srem_allones(i32 %X) nounwind { +; CHECK-LABEL: test_srem_allones: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w0, w0, lt +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 4294967295 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by powers of two much better elsewhere. +define i32 @test_srem_pow2(i32 %X) nounwind { +; CHECK-LABEL: test_srem_pow2: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #15 // =15 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xfffffff0 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 16 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} diff --git a/test/CodeGen/X86/srem-seteq-optsize.ll b/test/CodeGen/X86/srem-seteq-optsize.ll new file mode 100644 index 00000000000..a48e9d23fda --- /dev/null +++ b/test/CodeGen/X86/srem-seteq-optsize.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64 + +; On X86, division in expensive. BuildRemEqFold should therefore run even +; when optimizing for size. Only optimizing for minimum size retains a plain div. + +define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { +; X86-LABEL: test_minsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl $5 +; X86-NEXT: popl %ecx +; X86-NEXT: cltd +; X86-NEXT: idivl %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB0_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: pushl $-10 +; X86-NEXT: popl %eax +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: +; X86-NEXT: pushl $42 +; X86-NEXT: popl %eax +; X86-NEXT: retl +; +; X64-LABEL: test_minsize: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rcx +; X64-NEXT: cltd +; X64-NEXT: idivl %ecx +; X64-NEXT: testl %edx, %edx +; X64-NEXT: pushq $42 +; X64-NEXT: popq %rcx +; X64-NEXT: pushq $-10 +; X64-NEXT: popq %rax +; X64-NEXT: cmovel %ecx, %eax +; X64-NEXT: retq + %rem = srem i32 %X, 5 + %cmp = icmp eq i32 %rem, 0 + %ret = select i1 %cmp, i32 42, i32 -10 + ret i32 %ret +} + +define i32 @test_optsize(i32 %X) optsize nounwind readnone { +; X86-LABEL: test_optsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl $42, %eax +; X86-NEXT: je .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl $-10, %eax +; X86-NEXT: .LBB1_2: +; X86-NEXT: retl +; +; X64-LABEL: test_optsize: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq $1717986919, %rax, %rcx # imm = 0x66666667 +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rcx +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: movl $42, %ecx +; X64-NEXT: movl $-10, %eax +; X64-NEXT: cmovel %ecx, %eax +; X64-NEXT: retq + %rem = srem i32 %X, 5 + %cmp = icmp eq i32 %rem, 0 + %ret = select i1 %cmp, i32 42, i32 -10 + ret i32 %ret +} diff --git a/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll new file mode 100644 index 00000000000..6e51b4fcacc --- /dev/null +++ b/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -0,0 +1,3535 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE41 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX1 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL + +; Odd+Even divisors +define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,0,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,25,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,0,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;==============================================================================; + +; One all-ones divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_allones_eq: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_allones_eq: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_allones_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_allones_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_allones_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_allones_ne: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_allones_ne: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor in even divisor +define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_allones_eq: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_allones_eq: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_allones_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_allones_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_allones_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_allones_ne: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm6 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm6 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm3, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_allones_ne: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One power-of-two divisor in odd divisor +define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <0,u,1,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor in even divisor +define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,16,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One one divisor in odd divisor +define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,1,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $1717986919, %eax # imm = 0x66666667 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One one divisor in even divisor +define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;==============================================================================; + +; One all-ones divisor and power-of-two divisor divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1717986919] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE41-NEXT: psrad $3, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and power-of-two divisor divisor in even divisor +define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and power-of-two divisor divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One all-ones divisor and one one divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_allones_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1717986919,0,0,1717986919] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,1,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_allones_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in even divisor +define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_allones_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_allones_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One power-of-two divisor divisor and one divisor in odd divisor +define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $3, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in even divisor +define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,2147483649,0] +; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrlq $32, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $1, %xmm4 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrlq $32, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,0,2147483649,0] +; CHECK-SSE2-NEXT: pand %xmm6, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6 +; CHECK-SSE2-NEXT: psrlq $32, %xmm6 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm6 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $31, %xmm6 +; CHECK-SSE2-NEXT: pand %xmm3, %xmm6 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrlq $32, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} diff --git a/test/CodeGen/X86/srem-seteq-vec-splat.ll b/test/CodeGen/X86/srem-seteq-vec-splat.ll new file mode 100644 index 00000000000..843a26998ad --- /dev/null +++ b/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -0,0 +1,586 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE41 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX1 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL + +; Odd divisor +define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_25: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_25: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_25: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_25: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_25: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Even divisors +define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_100: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_100: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_100: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_100: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_100: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; +; Comparison constant has undef elements. +;------------------------------------------------------------------------------; + +define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_undef1: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_undef1: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_undef1: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_undef1: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_undef1: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_undef1: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_undef1: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_undef1: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_undef1: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_undef1: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; We can lower remainder of division by powers of two much better elsewhere. +define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_pow2: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE-NEXT: psrad $31, %xmm1 +; CHECK-SSE-NEXT: psrld $28, %xmm1 +; CHECK-SSE-NEXT: paddd %xmm0, %xmm1 +; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-SSE-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_pow2: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_pow2: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $28, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967280,4294967280,4294967280,4294967280] +; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_pow2: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpsrad $31, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $28, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; We could lower remainder of division by all-ones much better elsewhere. +define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_allones: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_allones: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_allones: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_allones: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; If all divisors are ones, this is constant-folded. +define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_one_eq: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_one_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_one_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_one_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_one_ne: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test_srem_one_ne: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq + %srem = srem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %srem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} diff --git a/test/CodeGen/X86/srem-seteq.ll b/test/CodeGen/X86/srem-seteq.ll new file mode 100644 index 00000000000..0ae07a51586 --- /dev/null +++ b/test/CodeGen/X86/srem-seteq.ll @@ -0,0 +1,420 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64 + +;------------------------------------------------------------------------------; +; Odd divisors +;------------------------------------------------------------------------------; + +define i32 @test_srem_odd(i32 %X) nounwind { +; X86-LABEL: test_srem_odd: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_odd: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 5 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +define i32 @test_srem_odd_25(i32 %X) nounwind { +; X86-LABEL: test_srem_odd_25: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $3, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_odd_25: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $35, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 25 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_odd, except the divisor has bit 30 set. +define i32 @test_srem_odd_bit30(i32 %X) nounwind { +; X86-LABEL: test_srem_odd_bit30: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $536870911, %edx # imm = 0x1FFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $27, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $1073741827, %edx, %edx # imm = 0x40000003 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_odd_bit30: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shlq $29, %rax +; X64-NEXT: subq %rcx, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $59, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $1073741827, %eax, %edx # imm = 0x40000003 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 1073741827 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_odd, except the divisor has bit 31 set. +define i32 @test_srem_odd_bit31(i32 %X) nounwind { +; X86-LABEL: test_srem_odd_bit31: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-536870913, %edx # imm = 0xDFFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $28, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-2147483645, %edx, %edx # imm = 0x80000003 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_odd_bit31: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shlq $29, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $60, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-2147483645, %eax, %edx # imm = 0x80000003 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 2147483651 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Even divisors +;------------------------------------------------------------------------------; + +define i16 @test_srem_even(i16 %X) nounwind { +; X86-LABEL: test_srem_even: +; X86: # %bb.0: +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull $18725, %ecx, %eax # imm = 0x4925 +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $31, %edx +; X86-NEXT: sarl $18, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll $4, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: setne %al +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: test_srem_even: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %ecx +; X64-NEXT: imull $18725, %ecx, %eax # imm = 0x4925 +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $18, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shll $4, %edx +; X64-NEXT: subl %eax, %edx +; X64-NEXT: subl %eax, %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: setne %al +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %srem = srem i16 %X, 14 + %cmp = icmp ne i16 %srem, 0 + %ret = zext i1 %cmp to i16 + ret i16 %ret +} + +define i32 @test_srem_even_100(i32 %X) nounwind { +; X86-LABEL: test_srem_even_100: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $5, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $100, %edx, %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_even_100: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $37, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $100, %eax, %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 100 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_even, except the divisor has bit 30 set. +define i32 @test_srem_even_bit30(i32 %X) nounwind { +; X86-LABEL: test_srem_even_bit30: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1073741721, %edx # imm = 0x3FFFFF99 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $28, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $1073741928, %edx, %edx # imm = 0x40000068 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_even_bit30: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1073741721, %rcx, %rax # imm = 0x3FFFFF99 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $60, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $1073741928, %eax, %edx # imm = 0x40000068 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 1073741928 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; This is like test_srem_odd, except the divisor has bit 31 set. +define i32 @test_srem_even_bit31(i32 %X) nounwind { +; X86-LABEL: test_srem_even_bit31: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2147483545, %edx # imm = 0x7FFFFF99 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $30, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-2147483546, %edx, %edx # imm = 0x80000066 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_even_bit31: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $2147483545, %rcx, %rax # imm = 0x7FFFFF99 +; X64-NEXT: shrq $32, %rax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $30, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-2147483546, %eax, %edx # imm = 0x80000066 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 2147483750 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Special case +;------------------------------------------------------------------------------; + +; 'NE' predicate is fine too. +define i32 @test_srem_odd_setne(i32 %X) nounwind { +; X86-LABEL: test_srem_odd_setne: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_odd_setne: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: setne %al +; X64-NEXT: retq + %srem = srem i32 %X, 5 + %cmp = icmp ne i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; The fold is invalid if divisor is 1. +define i32 @test_srem_one(i32 %X) nounwind { +; CHECK-LABEL: test_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: ret{{[l|q]}} + %srem = srem i32 %X, 1 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by all-ones much better elsewhere. +define i32 @test_srem_allones(i32 %X) nounwind { +; CHECK-LABEL: test_srem_allones: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: ret{{[l|q]}} + %srem = srem i32 %X, 4294967295 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by powers of two much better elsewhere. +define i32 @test_srem_pow2(i32 %X) nounwind { +; X86-LABEL: test_srem_pow2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: shrl $28, %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: andl $-16, %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_pow2: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: sarl $31, %ecx +; X64-NEXT: shrl $28, %ecx +; X64-NEXT: addl %edi, %ecx +; X64-NEXT: andl $-16, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 16 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +}