From 2b074a4200135c633ba23aadb1d86b8b60b9c871 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Thu, 15 Nov 2018 17:31:16 +0000 Subject: [PATCH] [AST] Store the string data in StringLiteral in a trailing array of chars Use the newly available space in the bit-fields of Stmt and store the string data in a trailing array of chars after the trailing array of SourceLocation. This cuts the size of StringLiteral by 2 pointers. Also slightly refactor StringLiteral::Create and StringLiteral::CreateEmpty so that StringLiteral::Create is just responsible for the allocation, and the constructor is responsible for doing all the initialization. This matches what is done for the other classes in general. This patch should have no other functional changes apart from this. A concern was raised during review about the interaction between this patch and serialization abbreviations. I believe however that there is currently no abbreviation defined for StringLiteral. The only statements/expressions which have abbreviations are currently DeclRefExpr, IntegerLiteral, CharacterLiteral and ImplicitCastExpr. Differential Revision: https://reviews.llvm.org/D54166 Reviewed By: dblaikie, rjmccall git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@346969 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/AST/Expr.h | 170 +++++++++++++++++----------- include/clang/AST/Stmt.h | 23 ++++ lib/AST/Expr.cpp | 132 ++++++++++----------- lib/Serialization/ASTReaderStmt.cpp | 48 +++++--- lib/Serialization/ASTWriterStmt.cpp | 18 ++- 5 files changed, 237 insertions(+), 154 deletions(-) diff --git a/include/clang/AST/Expr.h b/include/clang/AST/Expr.h index b2dcad3b63..004631019c 100644 --- a/include/clang/AST/Expr.h +++ b/include/clang/AST/Expr.h @@ -1568,98 +1568,131 @@ public: /// char X[2] = "foobar"; /// In this case, getByteLength() will return 6, but the string literal will /// have type "char[2]". 
-class StringLiteral : public Expr { +class StringLiteral final + : public Expr, + private llvm::TrailingObjects { + friend class ASTStmtReader; + friend TrailingObjects; + + /// StringLiteral is followed by several trailing objects. They are in order: + /// + /// * A single unsigned storing the length in characters of this string. The + /// length in bytes is this length times the width of a single character. + /// Always present and stored as a trailing objects because storing it in + /// StringLiteral would increase the size of StringLiteral by sizeof(void *) + /// due to alignment requirements. If you add some data to StringLiteral, + /// consider moving it inside StringLiteral. + /// + /// * An array of getNumConcatenated() SourceLocation, one for each of the + /// token this string is made of. + /// + /// * An array of getByteLength() char used to store the string data. + public: enum StringKind { Ascii, Wide, UTF8, UTF16, UTF32 }; private: - friend class ASTStmtReader; + unsigned numTrailingObjects(OverloadToken) const { return 1; } + unsigned numTrailingObjects(OverloadToken) const { + return getNumConcatenated(); + } - union { - const char *asChar; - const uint16_t *asUInt16; - const uint32_t *asUInt32; - } StrData; - unsigned Length; - unsigned CharByteWidth : 4; - unsigned Kind : 3; - unsigned IsPascal : 1; - unsigned NumConcatenated; - SourceLocation TokLocs[1]; - - StringLiteral(QualType Ty) : - Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false, - false) {} + unsigned numTrailingObjects(OverloadToken) const { + return getByteLength(); + } + + char *getStrDataAsChar() { return getTrailingObjects(); } + const char *getStrDataAsChar() const { return getTrailingObjects(); } + + const uint16_t *getStrDataAsUInt16() const { + return reinterpret_cast(getTrailingObjects()); + } + + const uint32_t *getStrDataAsUInt32() const { + return reinterpret_cast(getTrailingObjects()); + } + + /// Build a string literal. 
+ StringLiteral(const ASTContext &Ctx, StringRef Str, StringKind Kind, + bool Pascal, QualType Ty, const SourceLocation *Loc, + unsigned NumConcatenated); + + /// Build an empty string literal. + StringLiteral(EmptyShell Empty, unsigned NumConcatenated, unsigned Length, + unsigned CharByteWidth); /// Map a target and string kind to the appropriate character width. static unsigned mapCharByteWidth(TargetInfo const &Target, StringKind SK); + /// Set one of the string literal token. + void setStrTokenLoc(unsigned TokNum, SourceLocation L) { + assert(TokNum < getNumConcatenated() && "Invalid tok number"); + getTrailingObjects()[TokNum] = L; + } + public: /// This is the "fully general" constructor that allows representation of /// strings formed from multiple concatenated tokens. - static StringLiteral *Create(const ASTContext &C, StringRef Str, + static StringLiteral *Create(const ASTContext &Ctx, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, unsigned NumStrs); + const SourceLocation *Loc, + unsigned NumConcatenated); /// Simple constructor for string literals made from one token. - static StringLiteral *Create(const ASTContext &C, StringRef Str, + static StringLiteral *Create(const ASTContext &Ctx, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, SourceLocation Loc) { - return Create(C, Str, Kind, Pascal, Ty, &Loc, 1); + return Create(Ctx, Str, Kind, Pascal, Ty, &Loc, 1); } /// Construct an empty string literal. 
- static StringLiteral *CreateEmpty(const ASTContext &C, unsigned NumStrs); + static StringLiteral *CreateEmpty(const ASTContext &Ctx, + unsigned NumConcatenated, unsigned Length, + unsigned CharByteWidth); StringRef getString() const { - assert(CharByteWidth==1 - && "This function is used in places that assume strings use char"); - return StringRef(StrData.asChar, getByteLength()); + assert(getCharByteWidth() == 1 && + "This function is used in places that assume strings use char"); + return StringRef(getStrDataAsChar(), getByteLength()); } /// Allow access to clients that need the byte representation, such as /// ASTWriterStmt::VisitStringLiteral(). StringRef getBytes() const { // FIXME: StringRef may not be the right type to use as a result for this. - if (CharByteWidth == 1) - return StringRef(StrData.asChar, getByteLength()); - if (CharByteWidth == 4) - return StringRef(reinterpret_cast(StrData.asUInt32), - getByteLength()); - assert(CharByteWidth == 2 && "unsupported CharByteWidth"); - return StringRef(reinterpret_cast(StrData.asUInt16), - getByteLength()); + return StringRef(getStrDataAsChar(), getByteLength()); } void outputString(raw_ostream &OS) const; uint32_t getCodeUnit(size_t i) const { - assert(i < Length && "out of bounds access"); - if (CharByteWidth == 1) - return static_cast(StrData.asChar[i]); - if (CharByteWidth == 4) - return StrData.asUInt32[i]; - assert(CharByteWidth == 2 && "unsupported CharByteWidth"); - return StrData.asUInt16[i]; + assert(i < getLength() && "out of bounds access"); + switch (getCharByteWidth()) { + case 1: + return static_cast(getStrDataAsChar()[i]); + case 2: + return getStrDataAsUInt16()[i]; + case 4: + return getStrDataAsUInt32()[i]; + } + llvm_unreachable("Unsupported character width!"); } - unsigned getByteLength() const { return CharByteWidth*Length; } - unsigned getLength() const { return Length; } - unsigned getCharByteWidth() const { return CharByteWidth; } - - /// Sets the string data to the given string data. 
- void setString(const ASTContext &C, StringRef Str, - StringKind Kind, bool IsPascal); - - StringKind getKind() const { return static_cast(Kind); } + unsigned getByteLength() const { return getCharByteWidth() * getLength(); } + unsigned getLength() const { return *getTrailingObjects(); } + unsigned getCharByteWidth() const { return StringLiteralBits.CharByteWidth; } + StringKind getKind() const { + return static_cast(StringLiteralBits.Kind); + } - bool isAscii() const { return Kind == Ascii; } - bool isWide() const { return Kind == Wide; } - bool isUTF8() const { return Kind == UTF8; } - bool isUTF16() const { return Kind == UTF16; } - bool isUTF32() const { return Kind == UTF32; } - bool isPascal() const { return IsPascal; } + bool isAscii() const { return getKind() == Ascii; } + bool isWide() const { return getKind() == Wide; } + bool isUTF8() const { return getKind() == UTF8; } + bool isUTF16() const { return getKind() == UTF16; } + bool isUTF32() const { return getKind() == UTF32; } + bool isPascal() const { return StringLiteralBits.IsPascal; } bool containsNonAscii() const { for (auto c : getString()) @@ -1677,15 +1710,14 @@ public: /// getNumConcatenated - Get the number of string literal tokens that were /// concatenated in translation phase #6 to form this string literal. - unsigned getNumConcatenated() const { return NumConcatenated; } + unsigned getNumConcatenated() const { + return StringLiteralBits.NumConcatenated; + } + /// Get one of the string literal token. 
SourceLocation getStrTokenLoc(unsigned TokNum) const { - assert(TokNum < NumConcatenated && "Invalid tok number"); - return TokLocs[TokNum]; - } - void setStrTokenLoc(unsigned TokNum, SourceLocation L) { - assert(TokNum < NumConcatenated && "Invalid tok number"); - TokLocs[TokNum] = L; + assert(TokNum < getNumConcatenated() && "Invalid tok number"); + return getTrailingObjects()[TokNum]; } /// getLocationOfByte - Return a source location that points to the specified @@ -1702,14 +1734,18 @@ public: unsigned *StartTokenByteOffset = nullptr) const; typedef const SourceLocation *tokloc_iterator; - tokloc_iterator tokloc_begin() const { return TokLocs; } - tokloc_iterator tokloc_end() const { return TokLocs + NumConcatenated; } - SourceLocation getBeginLoc() const LLVM_READONLY { return TokLocs[0]; } - SourceLocation getEndLoc() const LLVM_READONLY { - return TokLocs[NumConcatenated - 1]; + tokloc_iterator tokloc_begin() const { + return getTrailingObjects(); + } + + tokloc_iterator tokloc_end() const { + return getTrailingObjects() + getNumConcatenated(); } + SourceLocation getBeginLoc() const LLVM_READONLY { return *tokloc_begin(); } + SourceLocation getEndLoc() const LLVM_READONLY { return *(tokloc_end() - 1); } + static bool classof(const Stmt *T) { return T->getStmtClass() == StringLiteralClass; } diff --git a/include/clang/AST/Stmt.h b/include/clang/AST/Stmt.h index 890a02948a..79a6646040 100644 --- a/include/clang/AST/Stmt.h +++ b/include/clang/AST/Stmt.h @@ -366,6 +366,28 @@ protected: unsigned IsExact : 1; }; + class StringLiteralBitfields { + friend class ASTStmtReader; + friend class StringLiteral; + + unsigned : NumExprBits; + + /// The kind of this string literal. + /// One of the enumeration values of StringLiteral::StringKind. + unsigned Kind : 3; + + /// The width of a single character in bytes. Only values of 1, 2, + /// and 4 bytes are supported. StringLiteral::mapCharByteWidth maps + /// the target + string kind to the appropriate CharByteWidth. 
+ unsigned CharByteWidth : 3; + + unsigned IsPascal : 1; + + /// The number of concatenated token this string is made of. + /// This is the number of trailing SourceLocation. + unsigned NumConcatenated; + }; + class CharacterLiteralBitfields { friend class CharacterLiteral; @@ -566,6 +588,7 @@ protected: PredefinedExprBitfields PredefinedExprBits; DeclRefExprBitfields DeclRefExprBits; FloatingLiteralBitfields FloatingLiteralBits; + StringLiteralBitfields StringLiteralBits; CharacterLiteralBitfields CharacterLiteralBits; UnaryOperatorBitfields UnaryOperatorBits; UnaryExprOrTypeTraitExprBitfields UnaryExprOrTypeTraitExprBits; diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp index 37575e7ba2..b811b22239 100644 --- a/lib/AST/Expr.cpp +++ b/lib/AST/Expr.cpp @@ -912,42 +912,80 @@ unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target, return CharByteWidth; } -StringLiteral *StringLiteral::Create(const ASTContext &C, StringRef Str, - StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, - unsigned NumStrs) { - assert(C.getAsConstantArrayType(Ty) && +StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str, + StringKind Kind, bool Pascal, QualType Ty, + const SourceLocation *Loc, + unsigned NumConcatenated) + : Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false, + false) { + assert(Ctx.getAsConstantArrayType(Ty) && "StringLiteral must be of constant array type!"); + unsigned CharByteWidth = mapCharByteWidth(Ctx.getTargetInfo(), Kind); + unsigned ByteLength = Str.size(); + assert((ByteLength % CharByteWidth == 0) && + "The size of the data must be a multiple of CharByteWidth!"); + + // Avoid the expensive division. The compiler should be able to figure it + // out by itself. However as of clang 7, even with the appropriate + // llvm_unreachable added just here, it is not able to do so. 
+ unsigned Length; + switch (CharByteWidth) { + case 1: + Length = ByteLength; + break; + case 2: + Length = ByteLength / 2; + break; + case 4: + Length = ByteLength / 4; + break; + default: + llvm_unreachable("Unsupported character width!"); + } - // Allocate enough space for the StringLiteral plus an array of locations for - // any concatenated string tokens. - void *Mem = - C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1), - alignof(StringLiteral)); - StringLiteral *SL = new (Mem) StringLiteral(Ty); + StringLiteralBits.Kind = Kind; + StringLiteralBits.CharByteWidth = CharByteWidth; + StringLiteralBits.IsPascal = Pascal; + StringLiteralBits.NumConcatenated = NumConcatenated; + *getTrailingObjects() = Length; - // OPTIMIZE: could allocate this appended to the StringLiteral. - SL->setString(C,Str,Kind,Pascal); + // Initialize the trailing array of SourceLocation. + // This is safe since SourceLocation is POD-like. + std::memcpy(getTrailingObjects(), Loc, + NumConcatenated * sizeof(SourceLocation)); - SL->TokLocs[0] = Loc[0]; - SL->NumConcatenated = NumStrs; + // Initialize the trailing array of char holding the string data. 
+ std::memcpy(getTrailingObjects(), Str.data(), ByteLength); +} - if (NumStrs != 1) - memcpy(&SL->TokLocs[1], Loc+1, sizeof(SourceLocation)*(NumStrs-1)); - return SL; +StringLiteral::StringLiteral(EmptyShell Empty, unsigned NumConcatenated, + unsigned Length, unsigned CharByteWidth) + : Expr(StringLiteralClass, Empty) { + StringLiteralBits.CharByteWidth = CharByteWidth; + StringLiteralBits.NumConcatenated = NumConcatenated; + *getTrailingObjects() = Length; } -StringLiteral *StringLiteral::CreateEmpty(const ASTContext &C, - unsigned NumStrs) { - void *Mem = - C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1), - alignof(StringLiteral)); - StringLiteral *SL = - new (Mem) StringLiteral(C.adjustStringLiteralBaseType(QualType())); - SL->CharByteWidth = 0; - SL->Length = 0; - SL->NumConcatenated = NumStrs; - return SL; +StringLiteral *StringLiteral::Create(const ASTContext &Ctx, StringRef Str, + StringKind Kind, bool Pascal, QualType Ty, + const SourceLocation *Loc, + unsigned NumConcatenated) { + void *Mem = Ctx.Allocate(totalSizeToAlloc( + 1, NumConcatenated, Str.size()), + alignof(StringLiteral)); + return new (Mem) + StringLiteral(Ctx, Str, Kind, Pascal, Ty, Loc, NumConcatenated); +} + +StringLiteral *StringLiteral::CreateEmpty(const ASTContext &Ctx, + unsigned NumConcatenated, + unsigned Length, + unsigned CharByteWidth) { + void *Mem = Ctx.Allocate(totalSizeToAlloc( + 1, NumConcatenated, Length * CharByteWidth), + alignof(StringLiteral)); + return new (Mem) + StringLiteral(EmptyShell(), NumConcatenated, Length, CharByteWidth); } void StringLiteral::outputString(raw_ostream &OS) const { @@ -1046,42 +1084,6 @@ void StringLiteral::outputString(raw_ostream &OS) const { OS << '"'; } -void StringLiteral::setString(const ASTContext &C, StringRef Str, - StringKind Kind, bool IsPascal) { - //FIXME: we assume that the string data comes from a target that uses the same - // code unit size and endianness for the type of string. 
- this->Kind = Kind; - this->IsPascal = IsPascal; - - CharByteWidth = mapCharByteWidth(C.getTargetInfo(),Kind); - assert((Str.size()%CharByteWidth == 0) - && "size of data must be multiple of CharByteWidth"); - Length = Str.size()/CharByteWidth; - - switch(CharByteWidth) { - case 1: { - char *AStrData = new (C) char[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asChar = AStrData; - break; - } - case 2: { - uint16_t *AStrData = new (C) uint16_t[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asUInt16 = AStrData; - break; - } - case 4: { - uint32_t *AStrData = new (C) uint32_t[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asUInt32 = AStrData; - break; - } - default: - llvm_unreachable("unsupported CharByteWidth"); - } -} - /// getLocationOfByte - Return a source location that points to the specified /// byte of this string literal. /// diff --git a/lib/Serialization/ASTReaderStmt.cpp b/lib/Serialization/ASTReaderStmt.cpp index ef370260c5..15e89db209 100644 --- a/lib/Serialization/ASTReaderStmt.cpp +++ b/lib/Serialization/ASTReaderStmt.cpp @@ -595,22 +595,35 @@ void ASTStmtReader::VisitImaginaryLiteral(ImaginaryLiteral *E) { void ASTStmtReader::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - unsigned Len = Record.readInt(); - assert(Record.peekInt() == E->getNumConcatenated() && - "Wrong number of concatenated tokens!"); - Record.skipInts(1); - auto kind = static_cast(Record.readInt()); - bool isPascal = Record.readInt(); - // Read string data - auto B = &Record.peekInt(); - SmallString<16> Str(B, B + Len); - E->setString(Record.getContext(), Str, kind, isPascal); - Record.skipInts(Len); - - // Read source locations - for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) + // NumConcatenated, Length and CharByteWidth are set by the empty + // ctor since they are needed to allocate storage for the trailing objects. 
+ unsigned NumConcatenated = Record.readInt(); + unsigned Length = Record.readInt(); + unsigned CharByteWidth = Record.readInt(); + assert((NumConcatenated == E->getNumConcatenated()) && + "Wrong number of concatenated tokens!"); + assert((Length == E->getLength()) && "Wrong Length!"); + assert((CharByteWidth == E->getCharByteWidth()) && "Wrong character width!"); + E->StringLiteralBits.Kind = Record.readInt(); + E->StringLiteralBits.IsPascal = Record.readInt(); + + // The character width is originally computed via mapCharByteWidth. + // Check that the deserialized character width is consistant with the result + // of calling mapCharByteWidth. + assert((CharByteWidth == + StringLiteral::mapCharByteWidth(Record.getContext().getTargetInfo(), + E->getKind())) && + "Wrong character width!"); + + // Deserialize the trailing array of SourceLocation. + for (unsigned I = 0; I < NumConcatenated; ++I) E->setStrTokenLoc(I, ReadSourceLocation()); + + // Deserialize the trailing array of char holding the string data. 
+ char *StrData = E->getStrDataAsChar(); + for (unsigned I = 0; I < Length * CharByteWidth; ++I) + StrData[I] = Record.readInt(); } void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) { @@ -2423,8 +2436,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_STRING_LITERAL: - S = StringLiteral::CreateEmpty(Context, - Record[ASTStmtReader::NumExprFields + 1]); + S = StringLiteral::CreateEmpty( + Context, + /* NumConcatenated=*/Record[ASTStmtReader::NumExprFields + 0], + /* Length=*/Record[ASTStmtReader::NumExprFields + 1], + /* CharByteWidth=*/Record[ASTStmtReader::NumExprFields + 2]); break; case EXPR_CHARACTER_LITERAL: diff --git a/lib/Serialization/ASTWriterStmt.cpp b/lib/Serialization/ASTWriterStmt.cpp index f32705c2ef..9f861bef89 100644 --- a/lib/Serialization/ASTWriterStmt.cpp +++ b/lib/Serialization/ASTWriterStmt.cpp @@ -518,17 +518,23 @@ void ASTStmtWriter::VisitImaginaryLiteral(ImaginaryLiteral *E) { void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - Record.push_back(E->getByteLength()); + + // Store the various bits of data of StringLiteral. Record.push_back(E->getNumConcatenated()); + Record.push_back(E->getLength()); + Record.push_back(E->getCharByteWidth()); Record.push_back(E->getKind()); Record.push_back(E->isPascal()); - // FIXME: String data should be stored as a blob at the end of the - // StringLiteral. However, we can't do so now because we have no - // provision for coping with abbreviations when we're jumping around - // the AST file during deserialization. - Record.append(E->getBytes().begin(), E->getBytes().end()); + + // Store the trailing array of SourceLocation. for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) Record.AddSourceLocation(E->getStrTokenLoc(I)); + + // Store the trailing array of char holding the string data. 
+ StringRef StrData = E->getBytes(); + for (unsigned I = 0, N = E->getByteLength(); I != N; ++I) + Record.push_back(StrData[I]); + Code = serialization::EXPR_STRING_LITERAL; } -- 2.40.0