From: Roman Lebedev Date: Fri, 28 Jun 2019 11:36:34 +0000 (+0000) Subject: [NFC][Codegen] Revisit test coverage for X % C == 0 fold X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bae9441dbd052da4cb1655f06ad23030a16b22e7;p=llvm [NFC][Codegen] Revisit test coverage for X % C == 0 fold git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364642 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index 5fef04eb744..759ccf307f8 100644 --- a/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -3,40 +3,39 @@ ; At the moment, BuildUREMEqFold does not handle nonsplat vectors. -define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_odd_div: +; Odd+Even divisors +define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: adrp x8, .LCPI0_2 -; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v3.4s, v2.4s -; CHECK-NEXT: umull v2.2d, v3.2s, v2.2s ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI0_3 -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_even_div: +; One all-ones divisor in odd divisor +define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] @@ -44,29 +43,25 @@ define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] ; CHECK-NEXT: adrp x8, .LCPI1_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: adrp x8, .LCPI1_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Can't fold due to last line -define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_pow2: +; One all-ones divisor in even divisor +define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] @@ -74,25 +69,29 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] ; CHECK-NEXT: adrp x8, .LCPI2_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI2_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_3] ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Can't fold due to second line -define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_one: +; One all-ones divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] @@ -106,25 +105,23 @@ define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { ; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s ; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: adrp x8, .LCPI3_4 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_nomulinv: +; One power-of-two divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] @@ -132,52 +129,55 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] ; CHECK-NEXT: adrp x8, .LCPI4_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_comp(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_comp: +; One power-of-two divisor in even divisor +define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_poweroftwo: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: adrp x9, .LCPI5_0 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI5_0] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v2.4s -; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v4.4s -; CHECK-NEXT: movi v1.4s, #5 -; CHECK-NEXT: ushr v2.4s, v2.4s, #2 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v3.4s +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_both: +; One power-of-two divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] @@ -185,82 +185,194 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] ; CHECK-NEXT: adrp x8, .LCPI6_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3] ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s ; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_div_undef(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_div_undef: +; One all-ones divisor and one power-of-two divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: adrp x8, .LCPI7_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] +; CHECK-NEXT: adrp x8, .LCPI7_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_comp_undef: +; One all-ones divisor and one power-of-two divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: movi v1.4s, #5 -; CHECK-NEXT: ushr v2.4s, v2.4s, #2 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: adrp x8, .LCPI8_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] +; CHECK-NEXT: adrp x8, .LCPI8_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI8_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_both_undef(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_both_undef: +; One all-ones divisor and one power-of-two divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: adrp x8, .LCPI9_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] +; CHECK-NEXT: adrp x8, .LCPI9_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_div_even_odd(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_div_even_odd: +;------------------------------------------------------------------------------; +; Negative tests - the fold is invalid if any divisor is 1. +;------------------------------------------------------------------------------; + +; One divisor in odd divisor +define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: adrp x8, .LCPI10_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_1] -; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI10_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_2] +; CHECK-NEXT: adrp x8, .LCPI10_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #2 -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One divisor in even divisors +define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: adrp x8, .LCPI11_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] +; CHECK-NEXT: adrp x8, .LCPI11_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] +; CHECK-NEXT: adrp x8, .LCPI11_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One divisor in odd-even divisors +define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: adrp x8, .LCPI12_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] +; CHECK-NEXT: adrp x8, .LCPI12_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] +; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret diff --git a/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/test/CodeGen/AArch64/urem-seteq-vec-splat.ll index 7621050631e..8b08f533239 100644 --- a/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -1,102 +1,105 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -; Tests BuildUREMEqFold for 4 x i32 splat vectors with odd divisor. -; See urem-seteq.ll for justification behind constants emitted. -define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_odd_vec_i32: +; Odd divisor +define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: movi v1.16b, #51 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: mov w9, #28835 +; CHECK-NEXT: movk w9, #2621, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Like test_urem_odd_vec_i32, but with 8 x i16 vectors. -define <8 x i16> @test_urem_odd_vec_i16(<8 x i16> %X) nounwind readnone { -; CHECK-LABEL: test_urem_odd_vec_i16: +; Even divisors +define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: dup v2.8h, w8 -; CHECK-NEXT: movi v1.16b, #51 -; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h -; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h -; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #100 +; CHECK-NEXT: ushr v2.4s, v2.4s, #5 +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <8 x i16> %X, - %cmp = icmp eq <8 x i16> %urem, - %ret = zext <8 x i1> %cmp to <8 x i16> - ret <8 x i16> %ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret } -; Tests BuildUREMEqFold for 4 x i32 splat vectors with even divisor. -; The expected behavior is that the fold is _not_ applied -; because it requires a ROTR in the even case, which has to be expanded. -define <4 x i32> @test_urem_even_vec_i32(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_even_vec_i32: +;------------------------------------------------------------------------------; +; Comparison constant has undef elements. +;------------------------------------------------------------------------------; + +define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #9363 -; CHECK-NEXT: movk w8, #37449, lsl #16 -; CHECK-NEXT: ushr v1.4s, v0.4s, #1 -; CHECK-NEXT: dup v3.4s, w8 -; CHECK-NEXT: umull2 v4.2d, v1.4s, v3.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v3.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: movi v2.4s, #14 -; CHECK-NEXT: ushr v1.4s, v1.4s, #2 -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #25 +; CHECK-NEXT: ushr v2.4s, v2.4s, #3 +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Like test_urem_even_vec_i32, but with 8 x i16 vectors. -; i16 is not legal for ROTR on AArch64, but ROTR also cannot be promoted to i32, -; so this would crash if BuildUREMEqFold was applied. -define <8 x i16> @test_urem_even_vec_i16(<8 x i16> %X) nounwind readnone { -; CHECK-LABEL: test_urem_even_vec_i16: +define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #18725 -; CHECK-NEXT: ushr v1.8h, v0.8h, #1 -; CHECK-NEXT: dup v3.8h, w8 -; CHECK-NEXT: umull2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: umull v1.4s, v1.4h, v3.4h -; CHECK-NEXT: uzp2 v1.8h, v1.8h, v4.8h -; CHECK-NEXT: movi v2.8h, #14 -; CHECK-NEXT: ushr v1.8h, v1.8h, #1 -; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, #0 -; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #100 +; CHECK-NEXT: ushr v2.4s, v2.4s, #5 +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <8 x i16> %X, - %cmp = icmp eq <8 x i16> %urem, - %ret = zext <8 x i1> %cmp to <8 x i16> - ret <8 x i16> %ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret } -; We should not proceed with this fold if the divisor is 1 or -1 -define <4 x i32> @test_urem_one_vec(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_one_vec: +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; The fold is invalid if divisor is 1. +define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_one: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ret @@ -106,10 +109,24 @@ define <4 x i32> @test_urem_one_vec(<4 x i32> %X) nounwind readnone { ret <4 x i32> %ret } -; BuildUREMEqFold does not work when the only odd factor of the divisor is 1. -; This ensures we don't touch powers of two. -define <4 x i32> @test_urem_pow2_vec(<4 x i32> %X) nounwind readnone { -; CHECK-LABEL: test_urem_pow2_vec: +; We can lower remainder of division by all-ones much better elsewhere. +define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_allones: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v0.4s, v0.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; We can lower remainder of division by powers of two much better elsewhere. +define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_pow2: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #15 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/test/CodeGen/AArch64/urem-seteq.ll b/test/CodeGen/AArch64/urem-seteq.ll index 406866c9546..164c0a5f1ec 100644 --- a/test/CodeGen/AArch64/urem-seteq.ll +++ b/test/CodeGen/AArch64/urem-seteq.ll @@ -1,11 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +;------------------------------------------------------------------------------; +; Odd divisors +;------------------------------------------------------------------------------; + ; This tests the BuildREMEqFold optimization with UREM, i32, odd divisor, SETEQ. ; The corresponding pseudocode is: ; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)] ; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460] -define i32 @test_urem_odd(i32 %X) nounwind readnone { +define i32 @test_urem_odd(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 @@ -22,8 +26,25 @@ define i32 @test_urem_odd(i32 %X) nounwind readnone { ret i32 %ret } +define i32 @test_urem_odd_25(i32 %X) nounwind { +; CHECK-LABEL: test_urem_odd_25: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: mov w9, #28836 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: movk w9, #2621, lsl #16 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %urem = urem i32 %X, 25 + %cmp = icmp eq i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ; This is like test_urem_odd, except the divisor has bit 30 set. -define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone { +define i32 @test_urem_odd_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd_bit30: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 @@ -39,7 +60,7 @@ define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone { } ; This is like test_urem_odd, except the divisor has bit 31 set. -define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone { +define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd_bit31: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 @@ -54,13 +75,17 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone { ret i32 %ret } +;------------------------------------------------------------------------------; +; Even divisors +;------------------------------------------------------------------------------; + ; This tests the BuildREMEqFold optimization with UREM, i16, even divisor, SETNE. ; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1. ; The corresponding pseudocode is: ; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087] ; Q <- [Q >>rot K] <=> [Q >>rot 1] ; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362] -define i16 @test_urem_even(i16 %X) nounwind readnone { +define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w9, #28087 @@ -79,8 +104,26 @@ define i16 @test_urem_even(i16 %X) nounwind readnone { ret i16 %ret } +define i32 @test_urem_even_100(i32 %X) nounwind { +; CHECK-LABEL: test_urem_even_100: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: mov w9, #23593 +; CHECK-NEXT: ror w8, w8, #2 +; CHECK-NEXT: movk w9, #655, lsl #16 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %urem = urem i32 %X, 100 + %cmp = icmp eq i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ; This is like test_urem_even, except the divisor has bit 30 set. -define i32 @test_urem_even_bit30(i32 %X) nounwind readnone { +define i32 @test_urem_even_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_urem_even_bit30: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #20165 @@ -97,7 +140,7 @@ define i32 @test_urem_even_bit30(i32 %X) nounwind readnone { } ; This is like test_urem_odd, except the divisor has bit 31 set. -define i32 @test_urem_even_bit31(i32 %X) nounwind readnone { +define i32 @test_urem_even_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_urem_even_bit31: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64251 @@ -113,8 +156,33 @@ define i32 @test_urem_even_bit31(i32 %X) nounwind readnone { ret i32 %ret } -; We should not proceed with this fold if the divisor is 1 or -1 -define i32 @test_urem_one(i32 %X) nounwind readnone { +;------------------------------------------------------------------------------; +; Special case +;------------------------------------------------------------------------------; + +; 'NE' predicate is fine too. +define i32 @test_urem_odd_setne(i32 %X) nounwind { +; CHECK-LABEL: test_urem_odd_setne: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #52429 +; CHECK-NEXT: movk w8, #52428, lsl #16 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: mov w9, #858993459 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: ret + %urem = urem i32 %X, 5 + %cmp = icmp ne i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; The fold is invalid if divisor is 1. +define i32 @test_urem_one(i32 %X) nounwind { ; CHECK-LABEL: test_urem_one: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w0, #1 @@ -125,28 +193,22 @@ define i32 @test_urem_one(i32 %X) nounwind readnone { ret i32 %ret } -define i32 @test_urem_100(i32 %X) nounwind readnone { -; CHECK-LABEL: test_urem_100: +; We can lower remainder of division by all-ones much better elsewhere. +define i32 @test_urem_allones(i32 %X) nounwind { +; CHECK-LABEL: test_urem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: mov w9, #23593 -; CHECK-NEXT: ror w8, w8, #2 -; CHECK-NEXT: movk w9, #655, lsl #16 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: cmp w8, #2 // =2 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret - %urem = urem i32 %X, 100 + %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret } -; We can lower remainder of division by powers of two much better elsewhere; -; also, BuildREMEqFold does not work when the only odd factor of the divisor is 1. -; This ensures we don't touch powers of two. -define i32 @test_urem_pow2(i32 %X) nounwind readnone { +; We can lower remainder of division by powers of two much better elsewhere. +define i32 @test_urem_pow2(i32 %X) nounwind { ; CHECK-LABEL: test_urem_pow2: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0xf diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 92087da1a15..39a65c47a78 100644 --- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -7,36 +7,36 @@ ; At the moment, BuildUREMEqFold does not handle nonsplat vectors. -define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_odd_div: +; Odd+Even divisors +define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,613566757,954437177] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: psrlq $32, %xmm1 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: psrld $3, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [3,5,7,9] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,25,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -44,24 +44,245 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_div: +; CHECK-SSE41-LABEL: test_urem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,613566757,954437177] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_even: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_even: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_even: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor in odd divisor +define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_allones: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_allones: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor in even divisor +define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_allones: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm2 ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psrlq $32, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -69,22 +290,18 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_div: +; CHECK-AVX1-LABEL: test_urem_even_allones: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,613566757,954437177] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -92,19 +309,15 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_div: +; CHECK-AVX2-LABEL: test_urem_even_allones: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,613566757,954437177] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -113,19 +326,15 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_div: +; CHECK-AVX512VL-LABEL: test_urem_even_allones: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,613566757,954437177] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -133,39 +342,41 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone { ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_even_div: +; One all-ones divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_allones: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,4294967295,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 @@ -174,22 +385,26 @@ define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_even_div: +; CHECK-SSE41-LABEL: test_urem_odd_even_allones: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,3435973837,2863311531,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $3, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -197,20 +412,23 @@ define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_even_div: +; CHECK-AVX1-LABEL: test_urem_odd_even_allones: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2863311531,3435973837,2863311531,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -218,9 +436,9 @@ define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_even_div: +; CHECK-AVX2-LABEL: test_urem_odd_even_allones: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -236,9 +454,9 @@ define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_even_div: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -253,60 +471,49 @@ define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone { ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Can't fold due to last line -define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_pow2: +; One power-of-two divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,268435456] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $3, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,16] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_pow2: +; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,268435456] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $3, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -314,20 +521,120 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_pow2: +; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,268435456] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor in even divisor +define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -335,13 +642,13 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_pow2: +; CHECK-AVX2-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,268435456] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 @@ -352,13 +659,13 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_pow2: +; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,268435456] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 @@ -368,89 +675,90 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone { ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Can't fold due to second line -define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_one: +; One power-of-two divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,268435456,1374389535] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $3, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[0,2] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [6,1,12,14] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,16,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_one: +; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,0,2863311531,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,268435456,1374389535] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $3, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_one: +; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2863311531,0,2863311531,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,268435456,1374389535] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -458,9 +766,9 @@ define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_one: +; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -469,7 +777,6 @@ define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -477,9 +784,9 @@ define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_one: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -488,47 +795,162 @@ define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone { ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_nomulinv: +; One all-ones divisor and one power-of-two divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,1374389535,2863311531,2454267027] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,4294967295,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one power-of-two divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $5, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,100,12,14] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,16,4294967295,14] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $3, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -536,13 +958,13 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_nomulinv: +; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,1374389535,2863311531,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 @@ -551,9 +973,8 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE41-NEXT: psrld $2, %xmm2 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $3, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 @@ -563,21 +984,20 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_nomulinv: +; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2863311531,1374389535,2863311531,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 @@ -587,9 +1007,9 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_nomulinv: +; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,1374389535,2863311531,2454267027] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -605,9 +1025,9 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_nomulinv: +; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,1374389535,2863311531,2454267027] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -622,232 +1042,281 @@ define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone { ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_comp(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_comp: +; One all-ones divisor and one power-of-two divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pslld $2, %xmm1 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,4294967295,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_comp: +; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_comp: +; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_comp: +; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_comp: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_both: +;------------------------------------------------------------------------------; +; Negative tests - the fold is invalid if any divisor is 1. +;------------------------------------------------------------------------------; + +; One divisor in odd divisor +define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2863311531,u,2863311531,u> +; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-SSE2-NEXT: movd %eax, %xmm1 ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: psrld $3, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_both: +; CHECK-SSE41-LABEL: test_urem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2863311531,u,2863311531,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_both: +; CHECK-AVX1-LABEL: test_urem_odd_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_both: +; CHECK-AVX2-LABEL: test_urem_odd_one: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_both: +; CHECK-AVX512VL-LABEL: test_urem_odd_one: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} - -define <4 x i32> @test_urem_div_undef(<4 x i32> %X) nounwind readnone { -; CHECK-SSE-LABEL: test_urem_div_undef: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: retq -; -; CHECK-AVX-LABEL: test_urem_div_undef: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_comp_undef: +; One divisor in even divisors +define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pslld $2, %xmm1 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_comp_undef: +; CHECK-SSE41-LABEL: test_urem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -855,15 +1324,17 @@ define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_comp_undef: +; CHECK-AVX1-LABEL: test_urem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -871,114 +1342,117 @@ define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_comp_undef: +; CHECK-AVX2-LABEL: test_urem_even_one: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_comp_undef: +; CHECK-AVX512VL-LABEL: test_urem_even_one: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} - -define <4 x i32> @test_urem_both_undef(<4 x i32> %X) nounwind readnone { -; CHECK-SSE-LABEL: test_urem_both_undef: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: retq -; -; CHECK-AVX-LABEL: test_urem_both_undef: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_div_even_odd(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_div_even_odd: +; One divisor in odd-even divisors +define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,2863311531,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,6,6] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [25,100,1,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_div_even_odd: +; CHECK-SSE41-LABEL: test_urem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,2863311531,2863311531] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-SSE41-NEXT: movd %eax, %xmm1 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $5, %xmm1 +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_div_even_odd: +; CHECK-AVX1-LABEL: test_urem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,3435973837,2863311531,2863311531] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -986,16 +1460,18 @@ define <4 x i32> @test_urem_div_even_odd(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_div_even_odd: +; CHECK-AVX2-LABEL: test_urem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,3435973837,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1003,23 +1479,25 @@ define <4 x i32> @test_urem_div_even_odd(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_div_even_odd: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,3435973837,2863311531,2863311531] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret diff --git a/test/CodeGen/X86/urem-seteq-vec-splat.ll b/test/CodeGen/X86/urem-seteq-vec-splat.ll index bc969205041..e7475eec344 100644 --- a/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -5,12 +5,11 @@ ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL -; Tests BuildUREMEqFold for 4 x i32 splat vectors with odd divisor. -; See urem-seteq.ll for justification behind constants emitted. -define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_odd_vec_i32: +; Odd divisor +define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -22,16 +21,16 @@ define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone { ; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_vec_i32: +; CHECK-SSE41-LABEL: test_urem_odd_25: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_vec_i32: +; CHECK-AVX1-LABEL: test_urem_odd_25: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 @@ -39,121 +38,172 @@ define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_vec_i32: +; CHECK-AVX2-LABEL: test_urem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] ; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_vec_i32: +; CHECK-AVX512VL-LABEL: test_urem_odd_25: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Like test_urem_odd_vec_i32, but with 8 x i16 vectors. -define <8 x i16> @test_urem_odd_vec_i16(<8 x i16> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_odd_vec_i16: +; Even divisors +define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psubusw {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrlw $15, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_vec_i16: +; CHECK-SSE41-LABEL: test_urem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] -; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrlw $15, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_vec_i16: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq - %urem = urem <8 x i16> %X, - %cmp = icmp eq <8 x i16> %urem, - %ret = zext <8 x i1> %cmp to <8 x i16> - ret <8 x i16> %ret +; CHECK-AVX1-LABEL: test_urem_even_100: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_100: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_100: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret } -; Tests BuildUREMEqFold for 4 x i32 splat vectors with even divisor. -; The expected behavior is that the fold is _not_ applied -; because it requires a ROTR in the even case, which has to be expanded. -define <4 x i32> @test_urem_even_vec_i32(<4 x i32> %X) nounwind readnone { -; CHECK-SSE2-LABEL: test_urem_even_vec_i32: +;------------------------------------------------------------------------------; +; Comparison constant has undef elements. +;------------------------------------------------------------------------------; + +define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_undef1: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,14,14,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_even_vec_i32: +; CHECK-SSE41-LABEL: test_urem_odd_undef1: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_even_vec_i32: +; CHECK-AVX1-LABEL: test_urem_odd_undef1: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -161,17 +211,16 @@ define <4 x i32> @test_urem_even_vec_i32(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_even_vec_i32: +; CHECK-AVX2-LABEL: test_urem_odd_undef1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [14,14,14,14] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] ; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -179,72 +228,144 @@ define <4 x i32> @test_urem_even_vec_i32(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_even_vec_i32: +; CHECK-AVX512VL-LABEL: test_urem_odd_undef1: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; Like test_urem_even_vec_i32, but with 8 x i16 vectors. -define <8 x i16> @test_urem_even_vec_i16(<8 x i16> %X) nounwind readnone { -; CHECK-SSE-LABEL: test_urem_even_vec_i16: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE-NEXT: psrlw $1, %xmm1 -; CHECK-SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; CHECK-SSE-NEXT: psrlw $1, %xmm1 -; CHECK-SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 -; CHECK-SSE-NEXT: psubw %xmm1, %xmm0 -; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE-NEXT: psrlw $15, %xmm0 -; CHECK-SSE-NEXT: retq +define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_undef1: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_undef1: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_undef1: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_even_vec_i16: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpsrlw $1, %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; CHECK-AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq - %urem = urem <8 x i16> %X, - %cmp = icmp eq <8 x i16> %urem, - %ret = zext <8 x i1> %cmp to <8 x i16> - ret <8 x i16> %ret +; CHECK-AVX2-LABEL: test_urem_even_undef1: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_undef1: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret } -; We should not proceed with this fold if the divisor is 1 or -1 -define <4 x i32> @test_urem_one_vec(<4 x i32> %X) nounwind readnone { -; CHECK-SSE-LABEL: test_urem_one_vec: +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; The fold is invalid if divisor is 1. +define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_one: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_one_vec: +; CHECK-AVX1-LABEL: test_urem_one: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_one_vec: +; CHECK-AVX2-LABEL: test_urem_one: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_one_vec: +; CHECK-AVX512VL-LABEL: test_urem_one: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX512VL-NEXT: retq @@ -254,10 +375,64 @@ define <4 x i32> @test_urem_one_vec(<4 x i32> %X) nounwind readnone { ret <4 x i32> %ret } -; BuildUREMEqFold does not work when the only odd factor of the divisor is 1. -; This ensures we don't touch powers of two. -define <4 x i32> @test_urem_pow2_vec(<4 x i32> %X) nounwind readnone { -; CHECK-SSE-LABEL: test_urem_pow2_vec: +; We can lower remainder of division by all-ones much better elsewhere. +define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_allones: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: psubd %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_allones: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_allones: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_allones: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_allones: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; We can lower remainder of division by powers of two much better elsewhere. +define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_pow2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 @@ -265,7 +440,7 @@ define <4 x i32> @test_urem_pow2_vec(<4 x i32> %X) nounwind readnone { ; CHECK-SSE-NEXT: psrld $31, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_pow2_vec: +; CHECK-AVX1-LABEL: test_urem_pow2: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -273,7 +448,7 @@ define <4 x i32> @test_urem_pow2_vec(<4 x i32> %X) nounwind readnone { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_pow2_vec: +; CHECK-AVX2-LABEL: test_urem_pow2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] ; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -282,7 +457,7 @@ define <4 x i32> @test_urem_pow2_vec(<4 x i32> %X) nounwind readnone { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_pow2_vec: +; CHECK-AVX512VL-LABEL: test_urem_pow2: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/urem-seteq.ll b/test/CodeGen/X86/urem-seteq.ll index cd1cb3715a2..26b9e85feb2 100644 --- a/test/CodeGen/X86/urem-seteq.ll +++ b/test/CodeGen/X86/urem-seteq.ll @@ -2,11 +2,15 @@ ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64 +;------------------------------------------------------------------------------; +; Odd divisors +;------------------------------------------------------------------------------; + ; This tests the BuildREMEqFold optimization with UREM, i32, odd divisor, SETEQ. ; The corresponding pseudocode is: ; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)] ; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460] -define i32 @test_urem_odd(i32 %X) nounwind readnone { +define i32 @test_urem_odd(i32 %X) nounwind { ; X86-LABEL: test_urem_odd: ; X86: # %bb.0: ; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD @@ -28,8 +32,30 @@ define i32 @test_urem_odd(i32 %X) nounwind readnone { ret i32 %ret } +define i32 @test_urem_odd_25(i32 %X) nounwind { +; X86-LABEL: test_urem_odd_25: +; X86: # %bb.0: +; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $171798692, %ecx # imm = 0xA3D70A4 +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_urem_odd_25: +; X64: # %bb.0: +; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $171798692, %ecx # imm = 0xA3D70A4 +; X64-NEXT: setb %al +; X64-NEXT: retq + %urem = urem i32 %X, 25 + %cmp = icmp eq i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ; This is like test_urem_odd, except the divisor has bit 30 set. -define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone { +define i32 @test_urem_odd_bit30(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_bit30: ; X86: # %bb.0: ; X86-NEXT: imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB @@ -52,7 +78,7 @@ define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone { } ; This is like test_urem_odd, except the divisor has bit 31 set. -define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone { +define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_bit31: ; X86: # %bb.0: ; X86-NEXT: imull $715827883, {{[0-9]+}}(%esp), %ecx # imm = 0x2AAAAAAB @@ -74,13 +100,17 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone { ret i32 %ret } +;------------------------------------------------------------------------------; +; Even divisors +;------------------------------------------------------------------------------; + ; This tests the BuildREMEqFold optimization with UREM, i16, even divisor, SETNE. ; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1. ; The corresponding pseudocode is: ; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087] ; Q <- [Q >>rot K] <=> [Q >>rot 1] ; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362] -define i16 @test_urem_even(i16 %X) nounwind readnone { +define i16 @test_urem_even(i16 %X) nounwind { ; X86-LABEL: test_urem_even: ; X86: # %bb.0: ; X86-NEXT: imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7 @@ -108,8 +138,32 @@ define i16 @test_urem_even(i16 %X) nounwind readnone { ret i16 %ret } +define i32 @test_urem_even_100(i32 %X) nounwind { +; X86-LABEL: test_urem_even_100: +; X86: # %bb.0: +; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 +; X86-NEXT: rorl $2, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_urem_even_100: +; X64: # %bb.0: +; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 +; X64-NEXT: rorl $2, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 +; X64-NEXT: setb %al +; X64-NEXT: retq + %urem = urem i32 %X, 100 + %cmp = icmp eq i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ; This is like test_urem_even, except the divisor has bit 30 set. -define i32 @test_urem_even_bit30(i32 %X) nounwind readnone { +define i32 @test_urem_even_bit30(i32 %X) nounwind { ; X86-LABEL: test_urem_even_bit30: ; X86: # %bb.0: ; X86-NEXT: imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5 @@ -134,7 +188,7 @@ define i32 @test_urem_even_bit30(i32 %X) nounwind readnone { } ; This is like test_urem_odd, except the divisor has bit 31 set. -define i32 @test_urem_even_bit31(i32 %X) nounwind readnone { +define i32 @test_urem_even_bit31(i32 %X) nounwind { ; X86-LABEL: test_urem_even_bit31: ; X86: # %bb.0: ; X86-NEXT: imull $-1157956869, {{[0-9]+}}(%esp), %ecx # imm = 0xBAFAFAFB @@ -158,8 +212,39 @@ define i32 @test_urem_even_bit31(i32 %X) nounwind readnone { ret i32 %ret } -; We should not proceed with this fold if the divisor is 1 or -1 -define i32 @test_urem_one(i32 %X) nounwind readnone { +;------------------------------------------------------------------------------; +; Special case +;------------------------------------------------------------------------------; + +; 'NE' predicate is fine too. +define i32 @test_urem_odd_setne(i32 %X) nounwind { +; X86-LABEL: test_urem_odd_setne: +; X86: # %bb.0: +; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: seta %al +; X86-NEXT: retl +; +; X64-LABEL: test_urem_odd_setne: +; X64: # %bb.0: +; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 +; X64-NEXT: seta %al +; X64-NEXT: retq + %urem = urem i32 %X, 5 + %cmp = icmp ne i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +;------------------------------------------------------------------------------; +; Negative tests +;------------------------------------------------------------------------------; + +; The fold is invalid if divisor is 1. +define i32 @test_urem_one(i32 %X) nounwind { ; CHECK-LABEL: test_urem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $1, %eax @@ -170,34 +255,32 @@ define i32 @test_urem_one(i32 %X) nounwind readnone { ret i32 %ret } -define i32 @test_urem_100(i32 %X) nounwind readnone { -; X86-LABEL: test_urem_100: +; We can lower remainder of division by all-ones much better elsewhere. +define i32 @test_urem_allones(i32 %X) nounwind { +; X86-LABEL: test_urem_allones: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: rorl $2, %ecx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 +; X86-NEXT: cmpl $2, %ecx ; X86-NEXT: setb %al ; X86-NEXT: retl ; -; X64-LABEL: test_urem_100: +; X64-LABEL: test_urem_allones: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: rorl $2, %ecx +; X64-NEXT: negl %edi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 +; X64-NEXT: cmpl $2, %edi ; X64-NEXT: setb %al ; X64-NEXT: retq - %urem = urem i32 %X, 100 + %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret } -; We can lower remainder of division by powers of two much better elsewhere; -; also, BuildREMEqFold does not work when the only odd factor of the divisor is 1. -; This ensures we don't touch powers of two. -define i32 @test_urem_pow2(i32 %X) nounwind readnone { +; We can lower remainder of division by powers of two much better elsewhere. +define i32 @test_urem_pow2(i32 %X) nounwind { ; X86-LABEL: test_urem_pow2: ; X86: # %bb.0: ; X86-NEXT: xorl %eax, %eax