* ``preserve_allcc``: code 15
* ``swiftcc`` : code 16
* ``cxx_fast_tlscc``: code 17
+ * ``tailcc`` : code 18
* ``x86_stdcallcc``: code 64
* ``x86_fastcallcc``: code 65
* ``arm_apcscc``: code 66
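As an editorial aside: these codes mirror the ``CallingConv`` enum values (see ``Tail = 18`` below), so the numbered-convention spelling in textual IR uses the same numbers. A minimal sketch, not part of the patch:

; Hypothetical illustration: "cc 18" and "tailcc" name the same convention,
; since CallingConv::Tail == 18.
declare cc 18 i32 @f(i32)
declare tailcc i32 @g(i32)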
and PowerPC if:
* Caller and callee have the calling convention ``fastcc``, ``cc 10`` (GHC
- calling convention) or ``cc 11`` (HiPE calling convention).
+ calling convention), ``cc 11`` (HiPE calling convention), or ``tailcc``.
* The call is a tail call - in tail position (ret immediately follows call and
ret uses value of call or is void).
-* Option ``-tailcallopt`` is enabled.
+* Option ``-tailcallopt`` is enabled or the calling convention is ``tailcc``.
* Platform-specific constraints are met.
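For illustration, a minimal IR sketch (not from the patch) that satisfies these conditions when compiled with ``llc -tailcallopt``: caller and callee share ``fastcc``, and the ``ret`` immediately consumes the call's value.

; Sketch only: both functions use fastcc, the call sits in tail position,
; and -tailcallopt is passed to llc, so the call is guaranteed to be TCO'd.
declare fastcc i32 @callee(i32)

define fastcc i32 @caller(i32 %x) {
entry:
  %r = tail call fastcc i32 @callee(i32 %x)
  ret i32 %r
}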
allows the target to use whatever tricks it wants to produce fast
code for the target, without having to conform to an externally
specified ABI (Application Binary Interface). `Tail calls can only
- be optimized when this, the GHC or the HiPE convention is
+ be optimized when this, the tailcc, the GHC or the HiPE convention is
used. <CodeGenerator.html#id80>`_ This calling convention does not
support varargs and requires the prototype of all callees to exactly
match the prototype of the function definition.
- On X86-64 RCX and R8 are available for additional integer returns, and
XMM2 and XMM3 are available for additional FP/vector returns.
- On iOS platforms, we use AAPCS-VFP calling convention.
+"``tailcc``" - Tail callable calling convention
+ This calling convention ensures that calls in tail position will always be
+ tail call optimized. This calling convention is equivalent to ``fastcc``,
+ except for an additional guarantee that tail calls will be produced
+ whenever possible. `Tail calls can only be optimized when this, the fastcc,
+ the GHC or the HiPE convention is used. <CodeGenerator.html#id80>`_ This
+ calling convention does not support varargs and requires the prototype of
+ all callees to exactly match the prototype of the function definition.
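A minimal sketch of what the new convention buys (assumed usage, not taken from the patch): a self-recursive ``tailcc`` function whose recursive call must be lowered as a jump, so it runs in constant stack space without any ``-tailcallopt`` flag.

; Sketch only: the call is in tail position, and caller and callee are both
; tailcc with matching prototypes, so TCO is guaranteed by the convention.
define tailcc i32 @count(i32 %n, i32 %acc) {
entry:
  %done = icmp eq i32 %n, 0
  br i1 %done, label %exit, label %rec
rec:
  %n1 = sub i32 %n, 1
  %acc1 = add i32 %acc, 1
  %r = tail call tailcc i32 @count(i32 %n1, i32 %acc1)
  ret i32 %r
exit:
  ret i32 %acc
}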
"``cc <n>``" - Numbered convention
Any calling convention may be specified by number, allowing
target-specific calling conventions to be used. Target specific
Tail call optimization for calls marked ``tail`` is guaranteed to occur if
the following conditions are met:
- - Caller and callee both have the calling convention ``fastcc``.
+ - Caller and callee both have the calling convention ``fastcc`` or ``tailcc``.
- The call is in tail position (ret immediately follows call and ret
uses value of call or is void).
- - Option ``-tailcallopt`` is enabled, or
- ``llvm::GuaranteedTailCallOpt`` is ``true``.
+ - Option ``-tailcallopt`` is enabled,
+ ``llvm::GuaranteedTailCallOpt`` is ``true``, or the calling convention
+ is ``tailcc``.
- `Platform-specific constraints are
met. <CodeGenerator.html#tailcallopt>`_
// CXX_FAST_TLS - Calling convention for access functions.
CXX_FAST_TLS = 17,
+ /// Tail - This calling convention attempts to make calls as fast as
+ /// possible while guaranteeing that tail call optimization can always
+ /// be performed.
+ Tail = 18,
+
// Target - This is the start of the target-specific calling conventions,
// e.g. fastcall and thiscall on X86.
FirstTargetCC = 64,
KEYWORD(amdgpu_ps);
KEYWORD(amdgpu_cs);
KEYWORD(amdgpu_kernel);
+ KEYWORD(tailcc);
KEYWORD(cc);
KEYWORD(c);
/// ::= 'amdgpu_ps'
/// ::= 'amdgpu_cs'
/// ::= 'amdgpu_kernel'
+/// ::= 'tailcc'
/// ::= 'cc' UINT
///
bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break;
case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
+ case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_cc: {
Lex.Lex();
return ParseUInt32(CC);
kw_amdgpu_ps,
kw_amdgpu_cs,
kw_amdgpu_kernel,
+ kw_tailcc,
// Attributes:
kw_attributes,
// longjmp on x86), it can end up causing miscompilation that has not
// been fully understood.
if (!Ret &&
- (!TM.Options.GuaranteedTailCallOpt || !isa<UnreachableInst>(Term)))
+ ((!TM.Options.GuaranteedTailCallOpt &&
+ CS.getCallingConv() != CallingConv::Tail) || !isa<UnreachableInst>(Term)))
return false;
// If I will have a chain, make sure no other instruction that will have a
case CallingConv::PreserveAll: Out << "preserve_allcc"; break;
case CallingConv::CXX_FAST_TLS: Out << "cxx_fast_tlscc"; break;
case CallingConv::GHC: Out << "ghccc"; break;
+ case CallingConv::Tail: Out << "tailcc"; break;
case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>,
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
+ CC != CallingConv::Tail &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
- // fastcc with -tailcallopt is intended to provide a guaranteed
- // tail call optimization. Fastisel doesn't know how to do that.
+ // fastcc with -tailcallopt, and the tailcc convention, are intended to
+ // provide a guaranteed tail call optimization. Fastisel doesn't know
+ // how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Let SDISel handle vararg functions.
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE)
+ CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
if (CS)
default: return false;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::WebKit_JS:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
- // fastcc with -tailcallopt is intended to provide a guaranteed
- // tail call optimization. Fastisel doesn't know how to do that.
+ // fastcc with -tailcallopt, and the tailcc convention, are intended to
+ // provide a guaranteed tail call optimization. Fastisel doesn't know
+ // how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
bool IsNested = HasNestArgument(&MF);
if (CallingConvention == CallingConv::X86_FastCall ||
- CallingConvention == CallingConv::Fast) {
+ CallingConvention == CallingConv::Fast ||
+ CallingConvention == CallingConv::Tail) {
if (IsNested)
report_fatal_error("Segmented stacks does not support fastcall with "
"nested function.");
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
- CC == CallingConv::HHVM);
+ CC == CallingConv::HHVM || CC == CallingConv::Tail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
- return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+ return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
+        CC == CallingConv::Tail;
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
+ bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
+ CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
if (Attr.getValueAsString() == "true")
isTailCall = false;
- if (Subtarget.isPICStyleGOT() &&
- !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
- if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- canGuaranteeTCO(CallConv))
+ else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+ bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
+ CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
if (IsCalleeWin64 != IsCallerWin64)
return false;
- if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
+ case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
// On Win64, all these conventions just use the default convention.
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+
+; tailcc will turn all of these musttail calls into tail calls.
+
+declare tailcc i32 @tailcallee(i32 %a1, i32 %a2)
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp tailcallee # TAILCALL
+;
+; X32-LABEL: tailcaller:
+; X32: # %bb.0: # %entry
+; X32-NEXT: jmp tailcallee # TAILCALL
+entry:
+ %tmp11 = musttail call tailcc i32 @tailcallee(i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+declare tailcc i8* @alias_callee()
+
+define tailcc noalias i8* @noalias_caller() nounwind {
+; X64-LABEL: noalias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp alias_callee # TAILCALL
+;
+; X32-LABEL: noalias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp alias_callee # TAILCALL
+ %p = musttail call tailcc i8* @alias_callee()
+ ret i8* %p
+}
+
+declare tailcc noalias i8* @noalias_callee()
+
+define tailcc i8* @alias_caller() nounwind {
+; X64-LABEL: alias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp noalias_callee # TAILCALL
+;
+; X32-LABEL: alias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp noalias_callee # TAILCALL
+ %p = musttail call tailcc noalias i8* @noalias_callee()
+ ret i8* %p
+}
+
+define tailcc void @void_test(i32, i32, i32, i32) {
+; X64-LABEL: void_test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp void_test # TAILCALL
+;
+; X32-LABEL: void_test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp void_test # TAILCALL
+ entry:
+ musttail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret void
+}
+
+define tailcc i1 @i1test(i32, i32, i32, i32) {
+; X64-LABEL: i1test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp i1test # TAILCALL
+;
+; X32-LABEL: i1test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp i1test # TAILCALL
+ entry:
+ %4 = musttail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret i1 %4
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+
+; With the tailcc calling convention, CodeGen guarantees tail call
+; optimization for all of these without needing -tailcallopt.
+
+declare tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4)
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp tailcallee # TAILCALL
+;
+; X32-LABEL: tailcaller:
+; X32: # %bb.0: # %entry
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: jmp tailcallee # TAILCALL
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee(i32 %in1, i32 %in2, i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+declare tailcc i8* @alias_callee()
+
+define tailcc noalias i8* @noalias_caller() nounwind {
+; X64-LABEL: noalias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp alias_callee # TAILCALL
+;
+; X32-LABEL: noalias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp alias_callee # TAILCALL
+ %p = tail call tailcc i8* @alias_callee()
+ ret i8* %p
+}
+
+declare tailcc noalias i8* @noalias_callee()
+
+define tailcc i8* @alias_caller() nounwind {
+; X64-LABEL: alias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp noalias_callee # TAILCALL
+;
+; X32-LABEL: alias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp noalias_callee # TAILCALL
+ %p = tail call tailcc noalias i8* @noalias_callee()
+ ret i8* %p
+}
+
+declare tailcc i32 @i32_callee()
+
+define tailcc i32 @ret_undef() nounwind {
+; X64-LABEL: ret_undef:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp i32_callee # TAILCALL
+;
+; X32-LABEL: ret_undef:
+; X32: # %bb.0:
+; X32-NEXT: jmp i32_callee # TAILCALL
+ %p = tail call tailcc i32 @i32_callee()
+ ret i32 undef
+}
+
+declare tailcc void @does_not_return()
+
+define tailcc i32 @noret() nounwind {
+; X64-LABEL: noret:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp does_not_return # TAILCALL
+;
+; X32-LABEL: noret:
+; X32: # %bb.0:
+; X32-NEXT: jmp does_not_return # TAILCALL
+ tail call tailcc void @does_not_return()
+ unreachable
+}
+
+define tailcc void @void_test(i32, i32, i32, i32) {
+; X64-LABEL: void_test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp void_test # TAILCALL
+;
+; X32-LABEL: void_test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp void_test # TAILCALL
+ entry:
+ tail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret void
+}
+
+define tailcc i1 @i1test(i32, i32, i32, i32) {
+; X64-LABEL: i1test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp i1test # TAILCALL
+;
+; X32-LABEL: i1test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp i1test # TAILCALL
+ entry:
+ %4 = tail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret i1 %4
+}
--- /dev/null
+; RUN: llc -mcpu=core < %s | FileCheck %s
+
+target triple = "i686-apple-darwin"
+
+declare tailcc void @foo(i32, i32, i32, i32, i32, i32)
+declare i32* @bar(i32*)
+
+define tailcc void @hoge(i32 %b) nounwind {
+; Do not overwrite pushed callee-save registers
+; CHECK: pushl
+; CHECK: subl $[[SIZE:[0-9]+]], %esp
+; CHECK-NOT: [[SIZE]](%esp)
+ %a = alloca i32
+ store i32 0, i32* %a
+ %d = tail call i32* @bar(i32* %a) nounwind
+ store i32 %b, i32* %d
+ tail call tailcc void @foo(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6) nounwind
+ ret void
+}
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=NO-OPTION
+; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls | FileCheck %s --check-prefix=DISABLE-TRUE
+; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls=false | FileCheck %s --check-prefix=DISABLE-FALSE
+
+; Check that command line option "-disable-tail-calls" overrides function
+; attribute "disable-tail-calls".
+
+; NO-OPTION-LABEL: {{\_?}}func_attr
+; NO-OPTION: callq {{\_?}}callee
+
+; DISABLE-FALSE-LABEL: {{\_?}}func_attr
+; DISABLE-FALSE: jmp {{\_?}}callee
+
+; DISABLE-TRUE-LABEL: {{\_?}}func_attr
+; DISABLE-TRUE: callq {{\_?}}callee
+
+define tailcc i32 @func_attr(i32 %a) #0 {
+entry:
+ %call = tail call tailcc i32 @callee(i32 %a)
+ ret i32 %call
+}
+
+; NO-OPTION-LABEL: {{\_?}}func_noattr
+; NO-OPTION: jmp {{\_?}}callee
+
+; DISABLE-FALSE-LABEL: {{\_?}}func_noattr
+; DISABLE-FALSE: jmp {{\_?}}callee
+
+; DISABLE-TRUE-LABEL: {{\_?}}func_noattr
+; DISABLE-TRUE: callq {{\_?}}callee
+
+define tailcc i32 @func_noattr(i32 %a) {
+entry:
+ %call = tail call tailcc i32 @callee(i32 %a)
+ ret i32 %call
+}
+
+declare tailcc i32 @callee(i32)
+
+attributes #0 = { "disable-tail-calls"="true" }
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -tailcallopt < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc -tailcallopt < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+
+; llc -tailcallopt should not enable tail calls from fastcc to tailcc or
+; vice versa: guaranteed TCO requires matching caller and callee conventions.
+
+declare tailcc i32 @tailcallee1(i32 %a1, i32 %a2, i32 %a3, i32 %a4)
+
+define fastcc i32 @tailcaller1(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: callq tailcallee1
+; X64-NEXT: retq $8
+;
+; X32-LABEL: tailcaller1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %edx
+; X32-NEXT: pushl %ecx
+; X32-NEXT: calll tailcallee1
+; X32-NEXT: retl
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee1(i32 %in1, i32 %in2, i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+declare fastcc i32 @tailcallee2(i32 %a1, i32 %a2, i32 %a3, i32 %a4)
+
+define tailcc i32 @tailcaller2(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: callq tailcallee2
+; X64-NEXT: retq $8
+;
+; X32-LABEL: tailcaller2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %edx
+; X32-NEXT: pushl %ecx
+; X32-NEXT: calll tailcallee2
+; X32-NEXT: retl
+entry:
+ %tmp11 = tail call fastcc i32 @tailcallee2(i32 %in1, i32 %in2, i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -fast-isel -fast-isel-abort=1 | FileCheck %s
+
+%0 = type { i64, i32, i8* }
+
+define tailcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 %arg1) nounwind {
+fail: ; preds = %entry
+ %tmp20 = tail call tailcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 undef) ; <i8*> [#uses=1]
+; CHECK: jmp "_visit_array_aux<`Reference>" ## TAILCALL
+ ret i8* %tmp20
+}
+
+define i32 @foo() nounwind {
+entry:
+ %0 = tail call i32 (...) @bar() nounwind ; <i32> [#uses=1]
+ ret i32 %0
+}
+
+declare i32 @bar(...) nounwind
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false | FileCheck %s
+
+declare tailcc i32 @callee(i32 %arg)
+define tailcc i32 @directcall(i32 %arg) {
+entry:
+; This is the large code model, so &callee may not fit into the jmp
+; instruction. Instead, stick it into a register.
+; CHECK: movabsq $callee, [[REGISTER:%r[a-z0-9]+]]
+; CHECK: jmpq *[[REGISTER]] # TAILCALL
+ %res = tail call tailcc i32 @callee(i32 %arg)
+ ret i32 %res
+}
+
+; Check that the register used for an indirect tail call doesn't
+; clobber any of the arguments.
+define tailcc i32 @indirect_manyargs(i32(i32,i32,i32,i32,i32,i32,i32)* %target) {
+; Adjust the stack to enter the function. (The amount of the
+; adjustment may change in the future, in which case the location of
+; the stack argument and the return adjustment will change too.)
+; CHECK: pushq
+; Move the call target into RAX, which won't be clobbered while restoring
+; callee-saved registers and won't be used for passing arguments.
+; CHECK: movq %rdi, %rax
+; Pass the stack argument.
+; CHECK: movl $7, 16(%rsp)
+; Pass the register arguments, in the right registers.
+; CHECK: movl $1, %edi
+; CHECK: movl $2, %esi
+; CHECK: movl $3, %edx
+; CHECK: movl $4, %ecx
+; CHECK: movl $5, %r8d
+; CHECK: movl $6, %r9d
+; Adjust the stack to "return".
+; CHECK: popq
+; And tail-call to the target.
+; CHECK: jmpq *%rax # TAILCALL
+ %res = tail call tailcc i32 %target(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7)
+ ret i32 %res
+}
+
+; Check that the register used for a direct tail call doesn't clobber
+; any of the arguments.
+declare tailcc i32 @manyargs_callee(i32,i32,i32,i32,i32,i32,i32)
+define tailcc i32 @direct_manyargs() {
+; Adjust the stack to enter the function. (The amount of the
+; adjustment may change in the future, in which case the location of
+; the stack argument and the return adjustment will change too.)
+; CHECK: pushq
+; Pass the stack argument.
+; CHECK: movl $7, 16(%rsp)
+; This is the large code model, so &manyargs_callee may not fit into
+; the jmp instruction. Put it into a register which won't be clobbered
+; while restoring callee-saved registers and won't be used for passing
+; arguments.
+; CHECK: movabsq $manyargs_callee, %rax
+; Pass the register arguments, in the right registers.
+; CHECK: movl $1, %edi
+; CHECK: movl $2, %esi
+; CHECK: movl $3, %edx
+; CHECK: movl $4, %ecx
+; CHECK: movl $5, %r8d
+; CHECK: movl $6, %r9d
+; Adjust the stack to "return".
+; CHECK: popq
+; And tail-call to the target.
+; CHECK: jmpq *%rax # TAILCALL
+ %res = tail call tailcc i32 @manyargs_callee(i32 1, i32 2, i32 3, i32 4,
+ i32 5, i32 6, i32 7)
+ ret i32 %res
+}
--- /dev/null
+; RUN: llc < %s -mtriple=i686-unknown-linux -no-x86-call-frame-opt | FileCheck %s
+; Linux has 8-byte stack alignment, so the parameters produce a stack size
+; of 20; ensure that a normal tailcc call has a matching stack size.
+
+
+define tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+ ret i32 %a3
+}
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2, i32 %in3, i32 %in4) {
+ %tmp11 = tail call tailcc i32 @tailcallee(i32 %in1, i32 %in2,
+ i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %tmp1 = call tailcc i32 @tailcaller( i32 1, i32 2, i32 3, i32 4 )
+ ; expect a matching "subl $[stacksize]" after the call (checked below)
+ ret i32 0
+}
+
+; CHECK: calll tailcaller
+; CHECK-NEXT: subl $12
--- /dev/null
+; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s
+define tailcc { { i8*, i8* }*, i8*} @init({ { i8*, i8* }*, i8*}, i32) {
+entry:
+ %2 = tail call tailcc { { i8*, i8* }*, i8* } @init({ { i8*, i8*}*, i8*} %0, i32 %1)
+ ret { { i8*, i8* }*, i8*} %2
+; CHECK: jmp init
+}
--- /dev/null
+; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s
+%struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32 }
+
+define tailcc i32 @tailcallee(%struct.s* byval %a) nounwind {
+entry:
+ %tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 0
+ %tmp3 = load i32, i32* %tmp2
+ ret i32 %tmp3
+; CHECK: tailcallee
+; CHECK: movl 4(%esp), %eax
+}
+
+define tailcc i32 @tailcaller(%struct.s* byval %a) nounwind {
+entry:
+ %tmp4 = tail call tailcc i32 @tailcallee(%struct.s* byval %a )
+ ret i32 %tmp4
+; CHECK: tailcaller
+; CHECK: jmp tailcallee
+}
--- /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+
+; FIXME: Win64 does not support byval.
+
+; Expect the entry point.
+; CHECK-LABEL: tailcaller:
+
+; Expect 2 rep;movs because of tail call byval lowering.
+; CHECK: rep;
+; CHECK: rep;
+
+; A sequence of copyto/copyfrom virtual registers is used to deal with byval
+; lowering appearing after moving arguments to registers. The following two
+; checks verify that the register allocator changes those sequences to direct
+; moves to argument register where it can (for registers that are not used in
+; byval lowering - not rsi, not rdi, not rcx).
+; Expect argument 4 to be moved directly to register edx.
+; CHECK: movl $7, %edx
+
+; Expect argument 6 to be moved directly to register r8.
+; CHECK: movl $17, %r8d
+
+; Expect not call but jmp to @tailcallee.
+; CHECK: jmp tailcallee
+
+; Expect the trailer.
+; CHECK: .size tailcaller
+
+%struct.s = type { i64, i64, i64, i64, i64, i64, i64, i64,
+ i64, i64, i64, i64, i64, i64, i64, i64,
+ i64, i64, i64, i64, i64, i64, i64, i64 }
+
+declare tailcc i64 @tailcallee(%struct.s* byval %a, i64 %val, i64 %val2, i64 %val3, i64 %val4, i64 %val5)
+
+
+define tailcc i64 @tailcaller(i64 %b, %struct.s* byval %a) {
+entry:
+ %tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 1
+ %tmp3 = load i64, i64* %tmp2, align 8
+ %tmp4 = tail call tailcc i64 @tailcallee(%struct.s* byval %a , i64 %tmp3, i64 %b, i64 7, i64 13, i64 17)
+ ret i64 %tmp4
+}
--- /dev/null
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+define tailcc i32 @bar(i32 %X, i32(double, i32) *%FP) {
+ %Y = tail call tailcc i32 %FP(double 0.0, i32 %X)
+ ret i32 %Y
+; CHECK: jmpl
+}
--- /dev/null
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+
+declare i32 @putchar(i32)
+
+define tailcc i32 @checktail(i32 %x, i32* %f, i32 %g) nounwind {
+; CHECK-LABEL: checktail:
+ %tmp1 = icmp sgt i32 %x, 0
+ br i1 %tmp1, label %if-then, label %if-else
+
+if-then:
+ %fun_ptr = bitcast i32* %f to i32(i32, i32*, i32)*
+ %arg1 = add i32 %x, -1
+ call i32 @putchar(i32 90)
+; CHECK: jmpl *%e{{.*}}
+ %res = tail call tailcc i32 %fun_ptr( i32 %arg1, i32 * %f, i32 %g)
+ ret i32 %res
+
+if-else:
+ ret i32 %x
+}
+
+
+define i32 @main() nounwind {
+ %f = bitcast i32 (i32, i32*, i32)* @checktail to i32*
+ %res = tail call tailcc i32 @checktail( i32 10, i32* %f,i32 10)
+ ret i32 %res
+}
--- /dev/null
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s
+
+; This test uses guaranteed TCO so these will be tail calls, despite the early
+; binding issues.
+
+define protected tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+ ret i32 %a3
+}
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1]
+ ret i32 %tmp11
+; CHECK: jmp tailcallee
+}
--- /dev/null
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s
+
+define tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+ ret i32 %a3
+}
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1]
+ ret i32 %tmp11
+; CHECK: movl tailcallee@GOT
+; CHECK: jmpl
+}
+
--- /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -post-RA-scheduler=true | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -post-RA-scheduler=true | FileCheck %s
+
+; FIXME: Redundant unused stack allocation could be eliminated.
+; CHECK: subq ${{24|72|80}}, %rsp
+
+; Check that lowered arguments on the stack do not overwrite each other.
+; Load stack parameter %in1 into a temporary register.
+; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]]
+; Load stack parameter %in2 into another temporary register.
+; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]]
+; Add register parameter %p1 to %in1.
+; CHECK: addl {{%edi|%ecx}}, [[R1]]
+; Store %in2 to the outgoing stack slot that previously held %in1.
+; CHECK-DAG: movl [[R2]], [[A1]](%rsp)
+; Store the addition result to the outgoing stack slot that held %in2.
+; CHECK-DAG: movl [[R1]], [[A2]](%rsp)
+; Eventually, do a TAILCALL
+; CHECK: TAILCALL
+
+declare tailcc i32 @tailcallee(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %a, i32 %b) nounwind
+
+define tailcc i32 @tailcaller(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %in1, i32 %in2) nounwind {
+entry:
+ %tmp = add i32 %in1, %p1
+ %retval = tail call tailcc i32 @tailcallee(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %in2,i32 %tmp)
+ ret i32 %retval
+}
\ externally_initialized
\ extern_weak
\ fastcc
+ \ tailcc
\ filter
\ from
\ gc