return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
}]>;
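+
+// Transform an immediate into a sign-extended 64-bit target constant.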
+def as_i64imm: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64);
+}]>;
+
def IMM8bit : PatLeaf <(imm),
[{return isUInt<8>(N->getZExtValue());}]
>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
-
let isCodeGenOnly = 1, isPseudo = 1 in {
-def V_MOV_I1 : InstSI <
- (outs VReg_1:$dst),
- (ins i1imm:$src),
- "", [(set i1:$dst, (imm:$src))]
->;
-
-def V_AND_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (and i1:$src0, i1:$src1))]
->;
-
-def V_OR_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (or i1:$src0, i1:$src1))]
->;
-
-def V_XOR_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (xor i1:$src0, i1:$src1))]
->;
-
let hasSideEffects = 1 in {
def SGPR_USE : InstSI <(outs),(ins), "", []>;
}
(S_MOV_B64 InlineImm<i64>:$imm)
>;
+// XXX - Should this use an s_cmp to set SCC?
+
+// Materialize an i1 constant as a sign-extended 64-bit value (true = -1, false = 0).
+def : Pat <
+ (i1 imm:$imm),
+ (S_MOV_B64 (i64 (as_i64imm $imm)))
+>;
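+// For example, an i1 true constant is materialized as "S_MOV_B64 -1"
+// (all bits of the 64-bit lane mask set) and i1 false as "S_MOV_B64 0".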
+
/********** ====================== **********/
/********** Interpolation Patterns **********/
/********** ====================== **********/
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
>;
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, copies from SCC
+// into these instructions will cause them to be moved to the VALU.
+def : Pat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
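+// For example, "xor i1 %a, %b" where %a and %b are compare results is
+// selected to S_XOR_B64 operating directly on the two SGPR pairs.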
+
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
def : Pat <
(f64 (sint_to_fp i1:$src)),
- (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
def : Pat <
Next = std::next(I);
MachineInstr &MI = *I;
- if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
- continue;
- }
-
- if (MI.getOpcode() == AMDGPU::V_AND_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_AND_B32_e64));
- continue;
- }
-
- if (MI.getOpcode() == AMDGPU::V_OR_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_OR_B32_e64));
- continue;
- }
-
- if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e64));
- continue;
- }
-
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
unsigned Reg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
continue;
}
- if (MI.getOpcode() != AMDGPU::COPY ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
+ if (MI.getOpcode() != AMDGPU::COPY)
continue;
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+
+ if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
+ !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ continue;
- const TargetRegisterClass *DstRC =
- MRI.getRegClass(MI.getOperand(0).getReg());
- const TargetRegisterClass *SrcRC =
- MRI.getRegClass(MI.getOperand(1).getReg());
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
if (DstRC == &AMDGPU::VReg_1RegClass &&
TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .addOperand(MI.getOperand(0))
- .addImm(0)
- .addImm(-1)
- .addOperand(MI.getOperand(1));
+ I1Defs.push_back(Dst.getReg());
+ DebugLoc DL = MI.getDebugLoc();
+
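+      // If the SGPR i1 value was produced by an S_MOV_B64 of an immediate
+      // (0 or -1), fold the copy into a plain V_MOV_B32 of that value.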
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
+ if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
+ if (DefInst->getOperand(1).isImm()) {
+ int64_t Val = DefInst->getOperand(1).getImm();
+ assert(Val == 0 || Val == -1);
+
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
+ .addOperand(Dst)
+ .addImm(Val);
+ MI.eraseFromParent();
+ continue;
+ }
+ }
+
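+      // Otherwise select 0 or -1 per lane based on the 64-bit SGPR condition.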
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+ .addOperand(Dst)
+ .addImm(0)
+ .addImm(-1)
+ .addOperand(Src);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
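+      // Rebuild the 64-bit condition mask by comparing the per-lane value
+      // against zero.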
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addImm(0);
+ .addOperand(Dst)
+ .addOperand(Src)
+ .addImm(0);
MI.eraseFromParent();
}
}
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.ceil.f64(double) nounwind readnone
declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
; SI: cmp_gt_i32
; SI: cndmask_b32
; SI: cndmask_b32
-; SI: cmp_gt_f64
-; SI: cndmask_b32
-; SI: cmp_ne_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI: v_cmp_o_f64
+; SI: v_cmp_neq_f64
+; SI: s_and_b64
+; SI: v_cmp_gt_f64
+; SI: s_and_b64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
; SI: v_add_f64
+; SI: s_endpgm
define void @fceil_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.ceil.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.floor.f64(double) nounwind readnone
declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
; SI: cmp_gt_i32
; SI: cndmask_b32
; SI: cndmask_b32
-; SI: cmp_lt_f64
-; SI: cndmask_b32
-; SI: cmp_ne_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI: v_cmp_o_f64
+; SI: v_cmp_neq_f64
+; SI: s_and_b64
+; SI: v_cmp_lt_f64
+; SI: s_and_b64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
; SI: v_add_f64
+; SI: s_endpgm
define void @ffloor_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.floor.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out
; R600-DAG: SETNE_DX10
; R600-DAG: AND_INT
; R600-DAG: SETNE_INT
-; SI: v_cmp_o_f32
-; SI: v_cmp_neq_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_and_b32_e32
+
+; SI-DAG: v_cmp_o_f32_e32 vcc
+; SI-DAG: v_cmp_neq_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
+; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[AND]]
+; SI: buffer_store_dword [[VRESULT]]
define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp one float %a, %b
; R600-DAG: SETE_DX10
; R600-DAG: OR_INT
; R600-DAG: SETNE_INT
-; SI: v_cmp_u_f32
-; SI: v_cmp_eq_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+
+; SI-DAG: v_cmp_u_f32_e32 vcc
+; SI-DAG: v_cmp_eq_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; SI: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
+; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[OR]]
+; SI: buffer_store_dword [[VRESULT]]
define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ueq float %a, %b
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_gt_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ugt float %a, %b
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_ge_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp uge float %a, %b
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_lt_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ult float %a, %b
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_le_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ule float %a, %b
}
; FUNC-LABEL: {{^}}f64_one:
-; SI: v_cmp_o_f64
-; SI: v_cmp_neq_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_and_b32_e32
+; SI-DAG: v_cmp_o_f64_e32 vcc
+; SI-DAG: v_cmp_neq_f64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
+; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[AND]]
+; SI: buffer_store_dword [[VRESULT]]
define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp one double %a, %b
; FUNC-LABEL: {{^}}f64_ueq:
; SI: v_cmp_u_f64
; SI: v_cmp_eq_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ueq double %a, %b
; FUNC-LABEL: {{^}}f64_ugt:
; SI: v_cmp_u_f64
; SI: v_cmp_gt_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ugt double %a, %b
; FUNC-LABEL: {{^}}f64_uge:
; SI: v_cmp_u_f64
; SI: v_cmp_ge_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp uge double %a, %b
; FUNC-LABEL: {{^}}f64_ult:
; SI: v_cmp_u_f64
; SI: v_cmp_lt_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ult double %a, %b
; FUNC-LABEL: {{^}}f64_ule:
; SI: v_cmp_u_f64
; SI: v_cmp_le_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: s_or_b64
+; SI: v_cndmask_b32
define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ule double %a, %b
ret void
}
+; FIXME: Should write to different SGPR pairs instead of copying to
+; VALU for i1 phi.
+
+; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_lt_i32_e64 [[CMP_IF:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
+; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
+
+; SI: BB2_1:
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_eq_i32_e64 [[CMP_ELSE:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
+; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
+
+; SI: v_cmp_ne_i32_e64 [[CMP_CMP:s\[[0-9]+:[0-9]+\]]], [[V_CMP]], 0
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
+; SI: buffer_store_dword [[RESULT]]
+define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp1 = icmp eq i32 %tid, 0
+ br i1 %tmp1, label %if, label %else
+
+if:
+ %gep.if = getelementptr i32 addrspace(1)* %a, i32 %tid
+ %a.val = load i32 addrspace(1)* %gep.if
+ %cmp.if = icmp eq i32 %a.val, 0
+ br label %endif
+
+else:
+ %gep.else = getelementptr i32 addrspace(1)* %b, i32 %tid
+ %b.val = load i32 addrspace(1)* %gep.else
+ %cmp.else = icmp slt i32 %b.val, 0
+ br label %endif
+
+endif:
+ %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else]
+ %ext = sext i1 %tmp4 to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.r600.read.tidig.x() #0
attributes #0 = { readnone }
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: @test_if
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
+define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
switch i32 %a, label %default [
i32 0, label %case0
end:
ret void
}
+
+; SI-LABEL: @simple_test_v_if
+; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+
+; SI: ; BB#1
+; SI: buffer_store_dword
+; SI: s_endpgm
+
+; SI: BB1_2:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %store, label %exit
+
+store:
+ %gep = getelementptr i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ ret void
+
+exit:
+ ret void
+}
+
+; SI-LABEL: @simple_test_v_loop
+; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: s_cbranch_execz BB2_2
+
+; SI: ; BB#1:
+; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; SI: BB2_3:
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: v_cmp_eq_i32_e32 vcc,
+; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: v_add_i32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
+; SI: s_cbranch_execnz BB2_3
+
+define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+entry:
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %is.0 = icmp ne i32 %tid, 0
+ %limit = add i32 %tid, 64
+ br i1 %is.0, label %loop, label %exit
+
+loop:
+ %i = phi i32 [%tid, %entry], [%i.inc, %loop]
+ %gep.src = getelementptr i32 addrspace(1)* %src, i32 %i
+ %gep.dst = getelementptr i32 addrspace(1)* %dst, i32 %i
+ %load = load i32 addrspace(1)* %src
+ store i32 %load, i32 addrspace(1)* %gep.dst
+ %i.inc = add nsw i32 %i, 1
+ %cmp = icmp eq i32 %limit, %i.inc
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; SI-LABEL: @multi_vcond_loop
+
+; Load loop limit from buffer
+; Branch to exit if uniformly not taken
+; SI: ; BB#0:
+; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
+; SI: v_cmp_gt_i32_e64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG]], [[OUTER_CMP_SREG]]
+; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
+; SI: s_cbranch_execz BB3_2
+
+; Initialize inner condition to false
+; SI: ; BB#1:
+; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
+
+; Clear exec bits for workitems that load -1s
+; SI: BB3_3:
+; SI: buffer_load_dword [[A:v[0-9]+]]
+; SI: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
+; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
+; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
+; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
+; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
+; SI: s_cbranch_execz BB3_5
+
+; SI: BB#4:
+; SI: buffer_store_dword
+; SI: v_cmp_ge_i64_e32 vcc
+; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
+
+; SI: BB3_5:
+; SI: s_or_b64 exec, exec, [[ORNEG1]]
+; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]]
+; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI: s_cbranch_execnz BB3_3
+
+; SI: BB#6
+; SI: s_or_b64 exec, exec, [[COND_STATE]]
+
+; SI: BB3_2:
+; SI-NOT: [[COND_STATE]]
+; SI: s_endpgm
+
+define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+bb:
+ %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tmp4 = sext i32 %tmp to i64
+ %tmp5 = getelementptr inbounds i32 addrspace(1)* %arg3, i64 %tmp4
+ %tmp6 = load i32 addrspace(1)* %tmp5, align 4
+ %tmp7 = icmp sgt i32 %tmp6, 0
+ %tmp8 = sext i32 %tmp6 to i64
+ br i1 %tmp7, label %bb10, label %bb26
+
+bb10: ; preds = %bb, %bb20
+ %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
+ %tmp12 = add nsw i64 %tmp11, %tmp4
+ %tmp13 = getelementptr inbounds i32 addrspace(1)* %arg1, i64 %tmp12
+ %tmp14 = load i32 addrspace(1)* %tmp13, align 4
+ %tmp15 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp12
+ %tmp16 = load i32 addrspace(1)* %tmp15, align 4
+ %tmp17 = icmp ne i32 %tmp14, -1
+ %tmp18 = icmp ne i32 %tmp16, -1
+ %tmp19 = and i1 %tmp17, %tmp18
+ br i1 %tmp19, label %bb20, label %bb26
+
+bb20: ; preds = %bb10
+ %tmp21 = add nsw i32 %tmp16, %tmp14
+ %tmp22 = getelementptr inbounds i32 addrspace(1)* %arg, i64 %tmp12
+ store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
+ %tmp23 = add nuw nsw i64 %tmp11, 1
+ %tmp24 = icmp slt i64 %tmp23, %tmp8
+ br i1 %tmp24, label %bb10, label %bb26
+
+bb26: ; preds = %bb10, %bb20, %bb
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
; FUNC-LABEL: {{^}}xor_i1:
; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
-; SI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-
+; SI-DAG: v_cmp_ge_f32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 0.0
+; SI-DAG: v_cmp_ge_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 1.0
+; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
%a = load float addrspace(1) * %in0
%b = load float addrspace(1) * %in1
%acmp = fcmp oge float %a, 0.000000e+00
- %bcmp = fcmp oge float %b, 0.000000e+00
+ %bcmp = fcmp oge float %b, 1.000000e+00
%xor = xor i1 %acmp, %bcmp
%result = select i1 %xor, float %a, float %b
store float %result, float addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}v_xor_i1:
+; SI: buffer_load_ubyte [[A:v[0-9]+]]
+; SI: buffer_load_ubyte [[B:v[0-9]+]]
+; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
+; SI: buffer_store_byte [[RESULT]]
+define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+ %a = load i1 addrspace(1)* %in0
+ %b = load i1 addrspace(1)* %in1
+ %xor = xor i1 %a, %b
+ store i1 %xor, i1 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}vector_xor_i32:
; SI: v_xor_b32_e32
define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {