From: Bjorn Pettersson
Date: Mon, 29 Apr 2019 17:50:10 +0000 (+0000)
Subject: [DAG] Refactor DAGCombiner::ReassociateOps
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=57b216445b260a206eb107301c5df1e312b04464;p=llvm

[DAG] Refactor DAGCombiner::ReassociateOps

Summary:
Extract the logic for doing reassociations from DAGCombiner::reassociateOps
into a helper function DAGCombiner::reassociateOpsCommutative, and use that
helper to trigger reassociation on the original operand order, or the
commuted operand order.

Codegen is not identical since the operand order will be different when
doing the reassociations for the commuted case. That causes some unfortunate
churn in some test cases. Apart from that this should be NFC.

Reviewers: spatel, craig.topper, tstellar

Reviewed By: spatel

Subscribers: dmgreen, dschuff, jvesely, nhaehnle, javed.absar, sbc100, jgravelle-google, hiraditya, aheejin, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61199

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359476 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5d2804c711a..df1929bf36b 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -458,7 +458,9 @@ namespace {
     SDValue visitFMULForFMADistributiveCombine(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
-    SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+    SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
+                                      SDValue N1);
+    SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                            SDValue N1, SDNodeFlags Flags);
 
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
@@ -1000,53 +1002,50 @@ static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
          ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
 }
 
-SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
-                                    SDValue N1, SDNodeFlags Flags) {
+// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
+// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
+SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
+                                               SDValue N0, SDValue N1) {
+  EVT VT = N0.getValueType();
+
+  if (N0.getOpcode() != Opc)
+    return SDValue();
+
   // Don't reassociate reductions.
-  if (Flags.hasVectorReduction())
+  if (N0->getFlags().hasVectorReduction())
     return SDValue();
-  EVT VT = N0.getValueType();
-  if (N0.getOpcode() == Opc && !N0->getFlags().hasVectorReduction()) {
-    if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
-      if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
-        // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
-        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R))
-          return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
-        return SDValue();
-      }
-      if (N0.hasOneUse()) {
-        // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
-        // use
-        SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
-        if (!OpNode.getNode())
-          return SDValue();
-        AddToWorklist(OpNode.getNode());
-        return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
-      }
+  if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
+    if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+      // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
+      if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2))
+        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+      return SDValue();
     }
-  }
-
-  if (N1.getOpcode() == Opc && !N1->getFlags().hasVectorReduction()) {
-    if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
-      if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
-        // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
-        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L))
-          return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+    if (N0.hasOneUse()) {
+      // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
+      //              iff (op x, c1) has one use
+      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
+      if (!OpNode.getNode())
        return SDValue();
-      }
-      if (N1.hasOneUse()) {
-        // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
-        // use
-        SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
-        if (!OpNode.getNode())
-          return SDValue();
-        AddToWorklist(OpNode.getNode());
-        return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
-      }
+      AddToWorklist(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
    }
   }
+  return SDValue();
+}
 
+// Try to reassociate commutative binops.
+SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+                                    SDValue N1, SDNodeFlags Flags) {
+  assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
+  // Don't reassociate reductions.
+  if (Flags.hasVectorReduction())
+    return SDValue();
+  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
+    return Combined;
+  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
+    return Combined;
   return SDValue();
 }
@@ -2193,7 +2192,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
     return NewSel;
 
   // reassociate add
-  if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
+  if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
     return RADD;
 
   // fold ((0-A) + B) -> B-A
@@ -3275,7 +3274,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
                                      N0.getOperand(1), N1));
 
   // reassociate mul
-  if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
     return RMUL;
 
   return SDValue();
@@ -4799,7 +4798,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     return NewSel;
 
   // reassociate and
-  if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
     return RAND;
 
   // Try to convert a constant mask AND into a shuffle clear mask.
@@ -5525,7 +5524,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     return BSwap;
 
   // reassociate or
-  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
     return ROR;
 
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
@@ -6412,7 +6411,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     return NewSel;
 
   // reassociate xor
-  if (SDValue RXOR = ReassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
+  if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
index 0009fe52e17..fa3f9d8fd38 100644
--- a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
+++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -19,8 +19,8 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
 ; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
 ; CHECK-NEXT: b.ne
 ; Next BB
-; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], [[I2]]
-; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], [[I1]]
+; CHECK: add [[BLOCKBASE1:x[0-9]+]], [[I1]], [[BLOCKBASE]]
+; CHECK-NEXT: add [[BLOCKBASE2:x[0-9]+]], [[I2]], [[BLOCKBASE]]
 ; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1]
 ; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1]
 ; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]]
diff --git a/test/CodeGen/AMDGPU/calling-conventions.ll b/test/CodeGen/AMDGPU/calling-conventions.ll
index 4c148d938fe..9affc6a53c9 100644
--- a/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -184,14 +184,14 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
 ; VI: s_and_b32 s1, s0, 0xffff0000
 ; VI: s_add_i32 s0, s0, 1
 ; VI: s_and_b32 s0, s0, 0xffff
-; VI: s_or_b32 s0, s0, s1
+; VI: s_or_b32 s0, s1, s0
 ; VI: s_add_i32 s0, s0, 0x10000
 ; VI: v_mov_b32_e32 v0, s0
 
 ; SI: s_lshl_b32 s1, s1, 16
 ; SI: s_add_i32 s0, s0, 1
 ; SI: s_and_b32 s0, s0, 0xffff
-; SI: s_or_b32 s0, s0, s1
+; SI: s_or_b32 s0, s1, s0
 ; SI: s_add_i32 s0, s0, 0x10000
 define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
   %add = add <2 x i16> %arg0,
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 53277027f58..c65d7fc02d9 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -289,18 +289,18 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6
-; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
 ; SI-NEXT: v_and_b32_e32 v7, s12, v7
+; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v1, v7, v6
+; SI-NEXT: v_or_b32_e32 v0, v6, v7
 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT: v_and_b32_e32 v0, s12, v4
-; SI-NEXT: v_or_b32_e32 v0, v0, v5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x900, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, s12, v4
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
+; SI-NEXT: v_or_b32_e32 v1, v5, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -335,8 +335,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT: v_add_u16_e32 v9, 9, v5
 ; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7
-; VI-NEXT: v_or_b32_sdwa v0, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT: v_add_u16_e32 v0, s8, v0
 ; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll
index 3eb39afdb88..9f130347913 100644
--- a/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8
 ; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv:
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], s[[Y]], [[SHL3]]
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], s[[Y]]
 ; SI: s_addk_i32 [[TMP]], 0x3d8
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
 ; SI: buffer_store_dword [[VRESULT]]
diff --git a/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index a3216422f18..4beebc0b34a 100644
--- a/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -216,7 +216,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
 ; SI-NEXT: s_add_i32 s0, s0, 12
 ; SI-NEXT: s_or_b32 s0, s0, 4
 ; SI-NEXT: s_and_b32 s0, s0, 0xff
-; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_or_b32 s0, s1, s0
 ; SI-NEXT: s_addk_i32 s0, 0x2c00
 ; SI-NEXT: s_or_b32 s0, s0, 0x300
 ; SI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/test/CodeGen/ARM/and-load-combine.ll b/test/CodeGen/ARM/and-load-combine.ll
index ef6a2ad7603..1bc90ba1b81 100644
--- a/test/CodeGen/ARM/and-load-combine.ll
+++ b/test/CodeGen/ARM/and-load-combine.ll
@@ -414,35 +414,35 @@ entry:
 define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a, i32* nocapture readonly %b) {
 ; ARM-LABEL: cmp_and8_short_int:
 ; ARM: @ %bb.0: @ %entry
-; ARM-NEXT: ldrb r0, [r0]
 ; ARM-NEXT: ldrb r1, [r1]
-; ARM-NEXT: and r0, r1, r0
+; ARM-NEXT: ldrb r0, [r0]
+; ARM-NEXT: and r0, r0, r1
 ; ARM-NEXT: clz r0, r0
 ; ARM-NEXT: lsr r0, r0, #5
 ; ARM-NEXT: bx lr
 ;
 ; ARMEB-LABEL: cmp_and8_short_int:
 ; ARMEB: @ %bb.0: @ %entry
-; ARMEB-NEXT: ldrb r0, [r0, #1]
 ; ARMEB-NEXT: ldrb r1, [r1, #3]
-; ARMEB-NEXT: and r0, r1, r0
+; ARMEB-NEXT: ldrb r0, [r0, #1]
+; ARMEB-NEXT: and r0, r0, r1
 ; ARMEB-NEXT: clz r0, r0
 ; ARMEB-NEXT: lsr r0, r0, #5
 ; ARMEB-NEXT: bx lr
 ;
 ; THUMB1-LABEL: cmp_and8_short_int:
 ; THUMB1: @ %bb.0: @ %entry
-; THUMB1-NEXT: ldrb r0, [r0]
 ; THUMB1-NEXT: ldrb r1, [r1]
-; THUMB1-NEXT: ands r1, r0
-; THUMB1-NEXT: rsbs r0, r1, #0
-; THUMB1-NEXT: adcs r0, r1
+; THUMB1-NEXT: ldrb r2, [r0]
+; THUMB1-NEXT: ands r2, r1
+; THUMB1-NEXT: rsbs r0, r2, #0
+; THUMB1-NEXT: adcs r0, r2
 ; THUMB1-NEXT: bx lr
 ;
 ; THUMB2-LABEL: cmp_and8_short_int:
 ; THUMB2: @ %bb.0: @ %entry
-; THUMB2-NEXT: ldrb r0, [r0]
 ; THUMB2-NEXT: ldrb r1, [r1]
+; THUMB2-NEXT: ldrb r0, [r0]
 ; THUMB2-NEXT: ands r0, r1
 ; THUMB2-NEXT: clz r0, r0
 ; THUMB2-NEXT: lsrs r0, r0, #5
@@ -846,7 +846,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; ARM-LABEL: test6:
 ; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: ldrb r0, [r0]
-; ARM-NEXT: and r0, r0, r1
+; ARM-NEXT: and r0, r1, r0
 ; ARM-NEXT: uxtb r1, r2
 ; ARM-NEXT: sub r0, r0, r1
 ; ARM-NEXT: clz r0, r0
@@ -856,7 +856,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; ARMEB-LABEL: test6:
 ; ARMEB: @ %bb.0: @ %entry
 ; ARMEB-NEXT: ldrb r0, [r0]
-; ARMEB-NEXT: and r0, r0, r1
+; ARMEB-NEXT: and r0, r1, r0
 ; ARMEB-NEXT: uxtb r1, r2
 ; ARMEB-NEXT: sub r0, r0, r1
 ; ARMEB-NEXT: clz r0, r0
@@ -893,7 +893,7 @@ define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
 ; ARM-LABEL: test7:
 ; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: ldrb r0, [r0]
-; ARM-NEXT: and r0, r0, r1
+; ARM-NEXT: and r0, r1, r0
 ; ARM-NEXT: uxtb r1, r2
 ; ARM-NEXT: sub r0, r0, r1
 ; ARM-NEXT: clz r0, r0
@@ -903,7 +903,7 @@ define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
 ; ARMEB-LABEL: test7:
 ; ARMEB: @ %bb.0: @ %entry
 ; ARMEB-NEXT: ldrb r0, [r0, #1]
-; ARMEB-NEXT: and r0, r0, r1
+; ARMEB-NEXT: and r0, r1, r0
 ; ARMEB-NEXT: uxtb r1, r2
 ; ARMEB-NEXT: sub r0, r0, r1
 ; ARMEB-NEXT: clz r0, r0
@@ -1550,34 +1550,34 @@ define arm_aapcscc i64 @test26(i64* nocapture %p) {
   ret i64 %and
 }
 
+define void @test27(i32* nocapture %ptr) {
 ; ARM-LABEL: test27:
-; ARM: @ %bb.0:
+; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: ldrb r1, [r0, #1]
 ; ARM-NEXT: lsl r1, r1, #16
 ; ARM-NEXT: str r1, [r0]
 ; ARM-NEXT: bx lr
 ;
 ; ARMEB-LABEL: test27:
-; ARMEB: @ %bb.0:
-; ARMEB-NEXT: ldrb r1, [r0, #2]
-; ARMEB-NEXT: lsl r1, r1, #16
-; ARMEB-NEXT: str r1, [r0]
-; ARMEB-NEXT: bx lr
+; ARMEB: @ %bb.0: @ %entry
+; ARMEB-NEXT: ldrb r1, [r0, #2]
+; ARMEB-NEXT: lsl r1, r1, #16
+; ARMEB-NEXT: str r1, [r0]
+; ARMEB-NEXT: bx lr
 ;
 ; THUMB1-LABEL: test27:
-; THUMB1: @ %bb.0:
-; THUMB1-NEXT: ldrb r1, [r0, #1]
-; THUMB1-NEXT: lsls r1, r1, #16
-; THUMB1-NEXT: str r1, [r0]
-; THUMB1-NEXT: bx lr
+; THUMB1: @ %bb.0: @ %entry
+; THUMB1-NEXT: ldrb r1, [r0, #1]
+; THUMB1-NEXT: lsls r1, r1, #16
+; THUMB1-NEXT: str r1, [r0]
+; THUMB1-NEXT: bx lr
 ;
 ; THUMB2-LABEL: test27:
-; THUMB2: @ %bb.0:
+; THUMB2: @ %bb.0: @ %entry
 ; THUMB2-NEXT: ldrb r1, [r0, #1]
 ; THUMB2-NEXT: lsls r1, r1, #16
 ; THUMB2-NEXT: str r1, [r0]
 ; THUMB2-NEXT: bx lr
-define void @test27(i32* nocapture %ptr) {
 entry:
   %0 = load i32, i32* %ptr, align 4
   %and = and i32 %0, 65280
diff --git a/test/CodeGen/ARM/load-combine-big-endian.ll b/test/CodeGen/ARM/load-combine-big-endian.ll
index c7c658ad70c..4e6e6a71937 100644
--- a/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -528,7 +528,7 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
 ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
 define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK: add r0, r0, r1
+; CHECK: add r0, r1, r0
 ; CHECK-NEXT: mov r1, #65280
 ; CHECK-NEXT: mov r2, #16711680
 ; CHECK-NEXT: ldr r0, [r0, #13]
@@ -540,7 +540,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK-NEXT: mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6: add r0, r1, r0
 ; CHECK-ARMv6-NEXT: ldr r0, [r0, #13]
 ; CHECK-ARMv6-NEXT: rev r0, r0
 ; CHECK-ARMv6-NEXT: bx lr
diff --git a/test/CodeGen/ARM/load-combine.ll b/test/CodeGen/ARM/load-combine.ll
index 72f4c5c7419..4206aad1d9e 100644
--- a/test/CodeGen/ARM/load-combine.ll
+++ b/test/CodeGen/ARM/load-combine.ll
@@ -479,12 +479,12 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
 ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
 define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK: add r0, r0, r1
+; CHECK: add r0, r1, r0
 ; CHECK-NEXT: ldr r0, [r0, #13]
 ; CHECK-NEXT: mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6: add r0, r1, r0
 ; CHECK-ARMv6-NEXT: ldr r0, [r0, #13]
 ; CHECK-ARMv6-NEXT: bx lr
   %tmp = add nuw nsw i32 %i, 4
diff --git a/test/CodeGen/SystemZ/buildvector-00.ll b/test/CodeGen/SystemZ/buildvector-00.ll
index 13360b2e577..f0d046c8874 100644
--- a/test/CodeGen/SystemZ/buildvector-00.ll
+++ b/test/CodeGen/SystemZ/buildvector-00.ll
@@ -13,7 +13,7 @@ define void @f1(<2 x i64> %a0) {
 ; CHECK-NEXT: vn %v0, %v0, %v0
 ; CHECK-NEXT: vno %v2, %v2, %v2
 ; CHECK-NEXT: vceqg %v0, %v0, %v1
-; CHECK-NEXT: vx %v0, %v2, %v0
+; CHECK-NEXT: vx %v0, %v0, %v2
 ; CHECK-NEXT: vnc %v0, %v2, %v0
 ; CHECK-NEXT: vlgvf %r0, %v0, 1
 ; CHECK-NEXT: tmll %r0, 1
diff --git a/test/CodeGen/Thumb2/constant-hoisting.ll b/test/CodeGen/Thumb2/constant-hoisting.ll
index fcb1de5f15f..a106900dc3e 100644
--- a/test/CodeGen/Thumb2/constant-hoisting.ll
+++ b/test/CodeGen/Thumb2/constant-hoisting.ll
@@ -17,16 +17,16 @@ define i32 @test_values(i32 %a, i32 %b) minsize optsize {
 ; CHECK-V6M-NEXT: adds r0, r1, r0
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .LBB0_5:
-; CHECK-V6M-NEXT: adds r0, r1, r0
+; CHECK-V6M-NEXT: adds r0, r0, r1
 ; CHECK-V6M-NEXT: adds r0, r0, #4
 ; CHECK-V6M-NEXT: .LBB0_6:
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .LBB0_7:
-; CHECK-V6M-NEXT: adds r0, r1, r0
+; CHECK-V6M-NEXT: adds r0, r0, r1
 ; CHECK-V6M-NEXT: adds r0, r0, #1
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .LBB0_8:
-; CHECK-V6M-NEXT: adds r0, r1, r0
+; CHECK-V6M-NEXT: adds r0, r0, r1
 ; CHECK-V6M-NEXT: adds r0, r0, #2
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .p2align 2
diff --git a/test/CodeGen/WebAssembly/address-offsets.ll b/test/CodeGen/WebAssembly/address-offsets.ll
index 2d50769dc5c..4f522335907 100644
--- a/test/CodeGen/WebAssembly/address-offsets.ll
+++ b/test/CodeGen/WebAssembly/address-offsets.ll
@@ -165,10 +165,10 @@ define i32 @load_test9() {
 ; NON-PIC-NEXT: i32.load $push4=, 0($pop3){{$}}
 ; NON-PIC-NEXT: return $pop4{{$}}
 
-; PIC-NEXT: global.get $push2=, g@GOT{{$}}
 ; PIC-NEXT: i32.const $push0=, 2{{$}}
 ; PIC-NEXT: i32.shl $push1=, $0, $pop0{{$}}
-; PIC-NEXT: i32.add $push3=, $pop2, $pop1{{$}}
+; PIC-NEXT: global.get $push2=, g@GOT{{$}}
+; PIC-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
 ; PIC-NEXT: i32.const $push4=, -40{{$}}
 ; PIC-NEXT: i32.add $push5=, $pop3, $pop4{{$}}
 ; PIC-NEXT: i32.load $push6=, 0($pop5){{$}}
@@ -206,7 +206,7 @@ define i32 @load_test11_noinbounds(i32* %p) {
 ; CHECK-NEXT: .functype load_test12 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -222,7 +222,7 @@ define i32 @load_test12(i32* %p, i32 %n) {
 ; CHECK-NEXT: .functype load_test13 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -284,7 +284,7 @@ define i32 @load_test16(i32* %p, i32 %n) {
 ; CHECK-NEXT: .functype load_test17 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -314,7 +314,7 @@ define i32 @load_test18(i32* %p, i32 %n) {
 ; CHECK-NEXT: .functype load_test19 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -342,7 +342,7 @@ define i32 @load_test20(i32* %p) {
 ; CHECK-NEXT: .functype load_test21 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, -40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -501,10 +501,10 @@ define void @store_test9(i32 %i) {
 ; NON-PIC-NEXT: i32.const $push2=, g-40{{$}}
 ; NON-PIC-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop3), $1{{$}}
-; PIC-NEXT: global.get $push2=, g@GOT{{$}}
 ; PIC-NEXT: i32.const $push0=, 2{{$}}
 ; PIC-NEXT: i32.shl $push1=, $0, $pop0{{$}}
-; PIC-NEXT: i32.add $push3=, $pop2, $pop1{{$}}
+; PIC-NEXT: global.get $push2=, g@GOT{{$}}
+; PIC-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
 ; PIC-NEXT: i32.const $push4=, -40{{$}}
 ; PIC-NEXT: i32.add $push5=, $pop3, $pop4{{$}}
 ; PIC-NEXT: i32.store 0($pop5), $1{{$}}
@@ -542,7 +542,7 @@ define void @store_test11_noinbounds(i32* %p, i32 %i) {
 ; CHECK-NEXT: .functype store_test12 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -558,7 +558,7 @@ define void @store_test12(i32* %p, i32 %n, i32 %i) {
 ; CHECK-NEXT: .functype store_test13 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -620,7 +620,7 @@ define void @store_test16(i32* %p, i32 %n, i32 %i) {
 ; CHECK-NEXT: .functype store_test17 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -650,7 +650,7 @@ define void @store_test18(i32* %p, i32 %n, i32 %i) {
 ; CHECK-NEXT: .functype store_test19 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -678,7 +678,7 @@ define void @store_test20(i32* %p, i32 %i) {
 ; CHECK-NEXT: .functype store_test21 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, -40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
diff --git a/test/CodeGen/X86/add-ext.ll b/test/CodeGen/X86/add-ext.ll
index 16646fa71ca..0675cd5eb21 100644
--- a/test/CodeGen/X86/add-ext.ll
+++ b/test/CodeGen/X86/add-ext.ll
@@ -26,7 +26,7 @@ define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
 ; CHECK-LABEL: add_nsw_sext_add:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
+; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
 ; CHECK-NEXT: retq
 
   %add = add nsw i32 %i, 5
@@ -73,7 +73,7 @@ define i8* @gep8(i32 %i, i8* %x) {
 ; CHECK-LABEL: gep8:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
+; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
 ; CHECK-NEXT: retq
 
   %add = add nsw i32 %i, 5
@@ -128,7 +128,7 @@ define i128* @gep128(i32 %i, i128* %x) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movslq %edi, %rax
 ; CHECK-NEXT: shlq $4, %rax
-; CHECK-NEXT: leaq 80(%rsi,%rax), %rax
+; CHECK-NEXT: leaq 80(%rax,%rsi), %rax
 ; CHECK-NEXT: retq
 
   %add = add nsw i32 %i, 5
@@ -169,12 +169,13 @@ define void @PR20134(i32* %a, i32 %i) {
 
 ; The same as @PR20134 but sign extension is replaced with zero extension
 define void @PR20134_zext(i32* %a, i32 %i) {
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
-; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
-; CHECK-NEXT: movl %ecx, (%rdi,%rax,4)
-; CHECK-NEXT: retq
+; CHECK-LABEL: PR20134_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
+; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
+; CHECK-NEXT: movl %ecx, (%rdi,%rax,4)
+; CHECK-NEXT: retq
 
   %add1 = add nuw i32 %i, 1
   %idx1 = zext i32 %add1 to i64
diff --git a/test/CodeGen/X86/combine-multiplies.ll b/test/CodeGen/X86/combine-multiplies.ll
index 98fc16ca226..052c96fd215 100644
--- a/test/CodeGen/X86/combine-multiplies.ll
+++ b/test/CodeGen/X86/combine-multiplies.ll
@@ -38,10 +38,10 @@ define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190
-; CHECK-NEXT: leal (%eax,%edx), %esi
+; CHECK-NEXT: leal (%edx,%eax), %esi
 ; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4)
-; CHECK-NEXT: movl $22, 2080(%eax,%edx)
-; CHECK-NEXT: movl $33, 10080(%eax,%edx)
+; CHECK-NEXT: movl $22, 2080(%edx,%eax)
+; CHECK-NEXT: movl $33, 10080(%edx,%eax)
 ; CHECK-NEXT: popl %esi
 ; CHECK-NEXT: retl
 entry:
diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll
index 8c69dba8c3b..99e444eebee 100644
--- a/test/CodeGen/X86/load-combine.ll
+++ b/test/CodeGen/X86/load-combine.ll
@@ -966,7 +966,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2:
 ; CHECK64: # %bb.0:
 ; CHECK64-NEXT: movl %esi, %eax
-; CHECK64-NEXT: movl 13(%rdi,%rax), %eax
+; CHECK64-NEXT: movl 13(%rax,%rdi), %eax
 ; CHECK64-NEXT: retq
   %tmp = add nuw nsw i32 %i, 4
   %tmp2 = add nuw nsw i32 %i, 3
@@ -1016,7 +1016,7 @@ define i32 @load_i32_by_i8_zaext_loads(i8* %arg, i32 %arg1) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl 12(%eax,%ecx), %eax
+; CHECK-NEXT: movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT: retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_zaext_loads:
@@ -1072,7 +1072,7 @@ define i32 @load_i32_by_i8_zsext_loads(i8* %arg, i32 %arg1) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl 12(%eax,%ecx), %eax
+; CHECK-NEXT: movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT: retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_zsext_loads:
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index 7a266235109..d3e758e7c74 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -66,7 +66,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT: movzbl 2(%r8,%rbx,4), %ebx
 ; GENERIC-NEXT: shll $16, %ebx
 ; GENERIC-NEXT: orl %eax, %ebx
-; GENERIC-NEXT: xorl 16(%rdx,%rcx), %ebx
+; GENERIC-NEXT: xorl 16(%rcx,%rdx), %ebx
 ; GENERIC-NEXT: shrl $8, %edi
 ; GENERIC-NEXT: movzbl 3(%r9,%rdi,4), %eax
 ; GENERIC-NEXT: shll $24, %eax
@@ -74,7 +74,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT: movzbl 2(%r8,%rdi,4), %edi
 ; GENERIC-NEXT: shll $16, %edi
 ; GENERIC-NEXT: orl %eax, %edi
-; GENERIC-NEXT: xorl 20(%rdx,%rcx), %edi
+; GENERIC-NEXT: xorl 20(%rcx,%rdx), %edi
 ; GENERIC-NEXT: movl %ebx, %eax
 ; GENERIC-NEXT: shrl $24, %eax
 ; GENERIC-NEXT: movb %al, (%rsi)
@@ -156,8 +156,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT: shll $16, %eax
 ; ATOM-NEXT: orl %edi, %ebp
 ; ATOM-NEXT: orl %r15d, %eax
-; ATOM-NEXT: xorl 20(%rdx,%rcx), %ebp
-; ATOM-NEXT: xorl 16(%rdx,%rcx), %eax
+; ATOM-NEXT: xorl 20(%rcx,%rdx), %ebp
+; ATOM-NEXT: xorl 16(%rcx,%rdx), %eax
 ; ATOM-NEXT: movl %eax, %edi
 ; ATOM-NEXT: shrl $16, %eax
 ; ATOM-NEXT: shrl $24, %edi
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
index f03175057fd..50f5542245a 100644
--- a/test/CodeGen/X86/merge_store.ll
+++ b/test/CodeGen/X86/merge_store.ll
@@ -44,7 +44,7 @@ entry:
 define void @indexed_store_merge(i64 %p, i8* %v) {
 ; CHECK-LABEL: indexed_store_merge:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl $0, 2(%rsi,%rdi)
+; CHECK-NEXT: movl $0, 2(%rdi,%rsi)
 ; CHECK-NEXT: movb $0, (%rsi)
 ; CHECK-NEXT: retq
 entry:
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 2a8e62b05aa..317e5885192 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -1403,18 +1403,18 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; SSE2-NEXT: movdqu (%rdi), %xmm0
 ; SSE2-NEXT: movdqu (%rsi), %xmm1
 ; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movdqu (%rdx), %xmm0
+; SSE2-NEXT: movdqu (%rcx), %xmm2
+; SSE2-NEXT: psadbw %xmm0, %xmm2
 ; SSE2-NEXT: movl $1, %eax
 ; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movdqu (%rdx), %xmm1
-; SSE2-NEXT: movdqu (%rcx), %xmm2
-; SSE2-NEXT: psadbw %xmm1, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: retq
 ;
 ; AVX1-LABEL: sad_unroll_nonzero_initial:
@@ -1425,8 +1425,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX1: # %bb.0: # %bb
 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
 ; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX1-NEXT: movl $1, %eax
 ; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -1438,12 +1438,12 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX2: # %bb.0: # %bb
 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
 ; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: movl $1, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1458,12 +1458,12 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX512: # %bb.0: # %bb
 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
 ; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
diff --git a/test/CodeGen/X86/vector-ext-logic.ll b/test/CodeGen/X86/vector-ext-logic.ll
index 01c6c1a2fd5..f81721ed1aa 100644
--- a/test/CodeGen/X86/vector-ext-logic.ll
+++ b/test/CodeGen/X86/vector-ext-logic.ll
@@ -146,7 +146,7 @@ define <8 x i16> @zext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; AVX2-LABEL: zext_and_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vandps %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: retq
   %xz = zext <8 x i8> %x to <8 x i16>
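
To make the reassociation concrete, here is a minimal illustrative sketch in LLVM IR (hypothetical functions, not taken from this patch). Loosely speaking, at the SelectionDAG level the first function's outer add sees the inner add as N0, so reassociateOpsCommutative(Opc, DL, N0, N1) matches it in the original operand order; in the second function the inner add arrives as N1, so the pattern is only caught by the commuted call reassociateOpsCommutative(Opc, DL, N1, N0). In both cases FoldConstantArithmetic merges the two constants into a single (add x, 24).

; Illustrative sketch only -- hypothetical functions, not tests from this commit.
define i32 @reassoc_direct(i32 %x) {
  ; (add (add %x, 8), 16): the inner add is N0 of the outer add,
  ; so the helper matches the original operand order.
  %t = add i32 %x, 8
  %r = add i32 %t, 16    ; reassociates and folds to (add %x, 24)
  ret i32 %r
}

define i32 @reassoc_commuted(i32 %x) {
  ; (add 16, (add %x, 8)): the inner add is N1 of the outer add,
  ; so only the commuted call sees the reassociable pattern.
  %t = add i32 %x, 8
  %r = add i32 16, %t    ; reassociates and folds to (add %x, 24)
  ret i32 %r
}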