From: Nate Begeman
Date: Sun, 20 Jun 2010 23:05:28 +0000 (+0000)
Subject: Implement remaining codegen for NEON, all operations should now work.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4be54302da40d3e7cba3d93115f312d2fcca1879;p=clang

Implement remaining codegen for NEON, all operations should now work.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@106407 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index 7ffbfb4a46..b42755ca72 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -200,7 +200,7 @@ def VSLI_N : WInst<"dddi", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">;
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.14 Loads and stores of a single vector
 def VLD1 : WInst<"dc", "QUcQUsQUiQUlQcQsQiQlQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
-def VLD1_LANE : WInst<"dci", "QUcQUsQUiQUlQcQsQiQlQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
+def VLD1_LANE : WInst<"dcdi", "QUcQUsQUiQUlQcQsQiQlQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
 def VLD1_DUP : WInst<"dc", "QUcQUsQUiQUlQcQsQiQlQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
 def VST1 : WInst<"vpd", "QUcQUsQUiQUlQcQsQiQlQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
 def VST1_LANE : WInst<"vpdi", "QUcQUsQUiQUlQcQsQiQlQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
@@ -213,9 +213,9 @@ def VLD4 : WInst<"4c", "QUcQUsQUiQcQsQiQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
 def VLD2_DUP : WInst<"2c", "UcUsUiUlcsilhfPcPs">;
 def VLD3_DUP : WInst<"3c", "UcUsUiUlcsilhfPcPs">;
 def VLD4_DUP : WInst<"4c", "UcUsUiUlcsilhfPcPs">;
-def VLD2_LANE : WInst<"2ci", "QUsQUiQsQiQhQfQPsUcUsUicsihfPcPs">;
-def VLD3_LANE : WInst<"3ci", "QUsQUiQsQiQhQfQPsUcUsUicsihfPcPs">;
-def VLD4_LANE : WInst<"4ci", "QUsQUiQsQiQhQfQPsUcUsUicsihfPcPs">;
+def VLD2_LANE : WInst<"2c2i", "QUsQUiQsQiQhQfQPsUcUsUicsihfPcPs">;
+def VLD3_LANE : WInst<"3c3i", "QUsQUiQsQiQhQfQPsUcUsUicsihfPcPs">;
+def VLD4_LANE : WInst<"4c4i", "QUsQUiQsQiQhQfQPsUcUsUicsihfPcPs">;
 def VST2 : WInst<"vp2", "QUcQUsQUiQcQsQiQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
 def VST3 : WInst<"vp3", "QUcQUsQUiQcQsQiQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
 def VST4 : WInst<"vp4", "QUcQUsQUiQcQsQiQhQfQPcQPsUcUsUiUlcsilhfPcPs">;
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 4c6a9b2316..8b19bc0ea0 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -948,17 +948,17 @@ Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
   }
 }
 
-const llvm::Type *GetNeonType(LLVMContext &Ctx, unsigned type, bool q) {
+const llvm::VectorType *GetNeonType(LLVMContext &C, unsigned type, bool q) {
   switch (type) {
     default: break;
     case 0:
-    case 5: return llvm::VectorType::get(llvm::Type::getInt8Ty(Ctx), 8 << (int)q);
+    case 5: return llvm::VectorType::get(llvm::Type::getInt8Ty(C), 8 << (int)q);
     case 6:
     case 7:
-    case 1: return llvm::VectorType::get(llvm::Type::getInt16Ty(Ctx), 4 << (int)q);
-    case 2: return llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 2 << (int)q);
-    case 3: return llvm::VectorType::get(llvm::Type::getInt64Ty(Ctx), 1 << (int)q);
-    case 4: return llvm::VectorType::get(llvm::Type::getFloatTy(Ctx), 2 << (int)q);
+    case 1: return llvm::VectorType::get(llvm::Type::getInt16Ty(C),4 << (int)q);
+    case 2: return llvm::VectorType::get(llvm::Type::getInt32Ty(C),2 << (int)q);
+    case 3: return llvm::VectorType::get(llvm::Type::getInt64Ty(C),1 << (int)q);
+    case 4: return llvm::VectorType::get(llvm::Type::getFloatTy(C),2 << (int)q);
   };
   return 0;
 }
@@ -1030,7 +1030,8 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
   bool poly = (type & 0x7) == 5 || (type & 0x7) == 6;
   bool splat = false;
 
-  const llvm::Type *Ty = GetNeonType(VMContext, type & 0x7, quad);
+  const llvm::VectorType *VTy = GetNeonType(VMContext, type & 0x7, quad);
+  const llvm::Type *Ty = VTy;
   if (!Ty)
     return 0;
 
@@ -1142,8 +1143,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
     const llvm::Type *I32Ty = llvm::Type::getInt32Ty(VMContext);
     SmallVector<Constant*, 16> Indices;
 
-    for (unsigned i = 0, e = cast<llvm::VectorType>(Ty)->getNumElements();
-         i != e; ++i)
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
       Indices.push_back(ConstantInt::get(I32Ty, i+CV));
 
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
@@ -1171,7 +1171,122 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
   case ARM::BI__builtin_neon_vhsubq_v:
     Int = usgn ? Intrinsic::arm_neon_vhsubu : Intrinsic::arm_neon_vhsubs;
     return EmitNeonCall(CGM.getIntrinsic(Int, &Ty, 1), Ops, "vhsub");
-  // FIXME: vld*
+  case ARM::BI__builtin_neon_vld1_v:
+  case ARM::BI__builtin_neon_vld1q_v:
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vld1, &Ty, 1),
+                        Ops, "vld1");
+  case ARM::BI__builtin_neon_vld1_lane_v:
+  case ARM::BI__builtin_neon_vld1q_lane_v:
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateLoad(Ops[0]);
+    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
+  case ARM::BI__builtin_neon_vld1_dup_v:
+  case ARM::BI__builtin_neon_vld1q_dup_v: {
+    Value *V = UndefValue::get(Ty);
+    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateLoad(Ops[0]);
+    llvm::Constant *CI = ConstantInt::get(llvm::Type::getInt32Ty(VMContext), 0);
+    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
+    return EmitNeonSplat(Ops[0], CI);
+  }
+  case ARM::BI__builtin_neon_vld2_v:
+  case ARM::BI__builtin_neon_vld2q_v: {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2, &Ty, 1);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case ARM::BI__builtin_neon_vld3_v:
+  case ARM::BI__builtin_neon_vld3q_v: {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3, &Ty, 1);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case ARM::BI__builtin_neon_vld4_v:
+  case ARM::BI__builtin_neon_vld4q_v: {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4, &Ty, 1);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case ARM::BI__builtin_neon_vld2_lane_v:
+  case ARM::BI__builtin_neon_vld2q_lane_v: {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2lane, &Ty, 1);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
+    Ops[1] = Builder.CreateCall(F, Ops.begin() + 1, Ops.end(), "vld2_lane");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case ARM::BI__builtin_neon_vld3_lane_v:
+  case ARM::BI__builtin_neon_vld3q_lane_v: {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3lane, &Ty, 1);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
+    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
+    Ops[1] = Builder.CreateCall(F, Ops.begin() + 1, Ops.end(), "vld3_lane");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case ARM::BI__builtin_neon_vld4_lane_v:
+  case ARM::BI__builtin_neon_vld4q_lane_v: {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4lane, &Ty, 1);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
+    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
+    Ops[5] = Builder.CreateBitCast(Ops[5], Ty);
+    Ops[1] = Builder.CreateCall(F, Ops.begin() + 1, Ops.end(), "vld4_lane");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case ARM::BI__builtin_neon_vld2_dup_v:
+  case ARM::BI__builtin_neon_vld3_dup_v:
+  case ARM::BI__builtin_neon_vld4_dup_v: {
+    switch (BuiltinID) {
+    case ARM::BI__builtin_neon_vld2_dup_v:
+      Int = Intrinsic::arm_neon_vld2lane;
+      break;
+    case ARM::BI__builtin_neon_vld3_dup_v:
+      Int = Intrinsic::arm_neon_vld3lane;
+      break;
+    case ARM::BI__builtin_neon_vld4_dup_v:
+      Int = Intrinsic::arm_neon_vld4lane;
+      break;
+    default: assert(0 && "unknown vld_dup intrinsic?");
+    }
+    Function *F = CGM.getIntrinsic(Int, &Ty, 1);
+    const llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType());
+
+    SmallVector<Value*, 6> Args;
+    Args.push_back(Ops[1]);
+    Args.append(STy->getNumElements(), UndefValue::get(Ty));
+
+    llvm::Constant *CI = ConstantInt::get(llvm::Type::getInt32Ty(VMContext), 0);
+    Args.push_back(CI);
+
+    Ops[1] = Builder.CreateCall(F, Args.begin(), Args.end(), "vld_dup");
+    // Splat lane 0 to all elements in each vector of the result.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      Value *Val = Builder.CreateExtractValue(Ops[1], i);
+      Value *Elt = Builder.CreateBitCast(Val, Ty);
+      Elt = EmitNeonSplat(Elt, CI);
+      Elt = Builder.CreateBitCast(Elt, Val->getType());
+      Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i);
+    }
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
   case ARM::BI__builtin_neon_vmax_v:
   case ARM::BI__builtin_neon_vmaxq_v:
     Int = usgn ? Intrinsic::arm_neon_vmaxu : Intrinsic::arm_neon_vmaxs;
@@ -1474,50 +1589,67 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
                                 ConstantAggregateZero::get(Ty));
     return Builder.CreateSExt(Ops[0], Ty, "vtst");
   }
-  // FIXME: transpose/zip/unzip don't currently match patterns for
-  // the non-q variants, but emitting 2 shufflevectors seems like a hack.
   case ARM::BI__builtin_neon_vtrn_v:
   case ARM::BI__builtin_neon_vtrnq_v: {
-    const llvm::Type *I32Ty = llvm::Type::getInt32Ty(VMContext);
-    SmallVector<Constant*, 16> Indices;
-    unsigned nElts = cast<llvm::VectorType>(Ty)->getNumElements();
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ty = llvm::Type::getInt32Ty(VMContext);
+    Value *SV;
+
     for (unsigned vi = 0; vi != 2; ++vi) {
-      for (unsigned i = 0; i != nElts; i += 2) {
-        Indices.push_back(ConstantInt::get(I32Ty, i+vi));
-        Indices.push_back(ConstantInt::get(I32Ty, i+nElts+vi));
+      SmallVector<Constant*, 16> Indices;
+      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
+        Indices.push_back(ConstantInt::get(Ty, i+vi));
+        Indices.push_back(ConstantInt::get(Ty, i+e+vi));
       }
+      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
+      SV = llvm::ConstantVector::get(Indices.begin(), Indices.size());
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn");
+      SV = Builder.CreateStore(SV, Addr);
     }
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
-    Value* SV = llvm::ConstantVector::get(Indices.begin(), Indices.size());
-    return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vtrn");
+    return SV;
   }
   case ARM::BI__builtin_neon_vuzp_v:
   case ARM::BI__builtin_neon_vuzpq_v: {
-    const llvm::Type *I32Ty = llvm::Type::getInt32Ty(VMContext);
-    SmallVector<Constant*, 16> Indices;
-    unsigned nElts = cast<llvm::VectorType>(Ty)->getNumElements();
-    for (unsigned vi = 0; vi != 2; ++vi)
-      for (unsigned i = 0; i != nElts; ++i)
-        Indices.push_back(ConstantInt::get(I32Ty, 2*i+vi));
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
-    Value* SV = llvm::ConstantVector::get(Indices.begin(), Indices.size());
-    return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vuzp");
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ty = llvm::Type::getInt32Ty(VMContext);
+    Value *SV;
+
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      SmallVector<Constant*, 16> Indices;
+      for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
+        Indices.push_back(ConstantInt::get(Ty, 2*i+vi));
+
+      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
+      SV = llvm::ConstantVector::get(Indices.begin(), Indices.size());
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp");
+      SV = Builder.CreateStore(SV, Addr);
+    }
+    return SV;
   }
   case ARM::BI__builtin_neon_vzip_v:
   case ARM::BI__builtin_neon_vzipq_v: {
-    const llvm::Type *I32Ty = llvm::Type::getInt32Ty(VMContext);
-    SmallVector<Constant*, 16> Indices;
-    unsigned nElts = cast<llvm::VectorType>(Ty)->getNumElements();
-    for (unsigned i = 0; i != nElts; ++i) {
-      Indices.push_back(ConstantInt::get(I32Ty, i));
-      Indices.push_back(ConstantInt::get(I32Ty, i+nElts));
-    }
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
-    Value* SV = llvm::ConstantVector::get(Indices.begin(), Indices.size());
-    return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vzip");
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ty = llvm::Type::getInt32Ty(VMContext);
+    Value *SV;
+
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      SmallVector<Constant*, 16> Indices;
+      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
+        Indices.push_back(ConstantInt::get(Ty, ((i + vi*e) >> 1)));
+        Indices.push_back(ConstantInt::get(Ty, ((i + vi*e) >> 1)+e));
+      }
+      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
+      SV = llvm::ConstantVector::get(Indices.begin(), Indices.size());
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip");
+      SV = Builder.CreateStore(SV, Addr);
+    }
+    return SV;
   }
   }
 }
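
Usage sketch (illustrative, not part of the commit): the snippet below drives a
few of the newly implemented builtins through their standard arm_neon.h
spellings. The file name, function name, and compiler flags are only one
plausible invocation; the exact IR emitted varies with element type and
optimization level.

/* neon_demo.c -- hypothetical example exercising vld2, vld2_lane, and vtrn.
   One plausible invocation for a 2010-era clang:
     clang -ccc-host-triple armv7-unknown-linux-gnueabi -mfpu=neon -O2 -S neon_demo.c */
#include <arm_neon.h>

uint16x4x2_t demo(const uint16_t *p, uint16x4_t a, uint16x4_t b) {
  /* Lowers to llvm.arm.neon.vld2; the two result vectors come back as a
     struct stored through the result pointer. */
  uint16x4x2_t pair = vld2_u16(p);

  /* Lowers to llvm.arm.neon.vld2lane, replacing lane 1 of each vector with
     newly loaded elements. */
  pair = vld2_lane_u16(p + 8, pair, 1);

  /* vtrn is now emitted as two shufflevectors, one per result vector, each
     stored to its slot in the returned struct. */
  uint16x4x2_t t = vtrn_u16(a, b);

  t.val[0] = vadd_u16(t.val[0], pair.val[0]);
  t.val[1] = vadd_u16(t.val[1], pair.val[1]);
  return t;
}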