From 5cee1195584fa8672253139c86e922daeda69b9e Mon Sep 17 00:00:00 2001 From: Douglas Gregor Date: Wed, 27 Jul 2011 05:40:30 +0000 Subject: [PATCH] Add support for C++0x unicode string and character literals, from Craig Topper! git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@136210 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/AST/Expr.h | 52 +++++-- include/clang/AST/Type.h | 2 + include/clang/Basic/DiagnosticLexKinds.td | 6 +- include/clang/Basic/IdentifierTable.h | 7 +- include/clang/Basic/TokenKinds.def | 12 +- include/clang/Lex/Lexer.h | 6 +- include/clang/Lex/LiteralSupport.h | 31 +++-- include/clang/Lex/Token.h | 5 +- include/clang/Lex/TokenConcatenation.h | 9 +- include/clang/Parse/Parser.h | 5 +- lib/AST/ASTImporter.cpp | 4 +- lib/AST/Expr.cpp | 9 +- lib/AST/StmtDumper.cpp | 9 +- lib/AST/StmtPrinter.cpp | 18 ++- lib/AST/StmtProfile.cpp | 4 +- lib/AST/Type.cpp | 12 ++ lib/CodeGen/CodeGenModule.cpp | 16 ++- lib/Lex/Lexer.cpp | 77 +++++++++-- lib/Lex/LiteralSupport.cpp | 157 +++++++++++++--------- lib/Lex/MacroArgs.cpp | 8 +- lib/Lex/PPDirectives.cpp | 4 +- lib/Lex/PPExpressions.cpp | 16 ++- lib/Lex/Pragma.cpp | 6 +- lib/Lex/TokenConcatenation.cpp | 64 ++++----- lib/Parse/ParseCXXInlineMethods.cpp | 3 + lib/Parse/ParseExpr.cpp | 6 + lib/Parse/ParseTentative.cpp | 6 + lib/Parse/Parser.cpp | 3 + lib/Rewrite/HTMLRewrite.cpp | 9 +- lib/Rewrite/RewriteObjC.cpp | 19 +-- lib/Sema/SemaChecking.cpp | 4 +- lib/Sema/SemaDeclAttr.cpp | 12 +- lib/Sema/SemaExpr.cpp | 39 +++++- lib/Sema/SemaExprCXX.cpp | 20 ++- lib/Sema/SemaExprObjC.cpp | 6 +- lib/Sema/SemaInit.cpp | 32 +++-- lib/Sema/SemaStmt.cpp | 8 +- lib/Sema/SemaTemplate.cpp | 18 ++- lib/Serialization/ASTReaderStmt.cpp | 4 +- lib/Serialization/ASTWriterStmt.cpp | 4 +- test/CXX/lex/lex.literal/lex.ccon/p1.cpp | 7 +- test/CodeGen/char-literal.c | 44 +++++- test/CodeGen/string-literal.c | 23 +++- test/Lexer/wchar.c | 4 +- test/Parser/char-literal-printing.c | 37 ++++- 
test/SemaCXX/type-convert-construct.cpp | 7 +- 46 files changed, 608 insertions(+), 246 deletions(-) diff --git a/include/clang/AST/Expr.h b/include/clang/AST/Expr.h index f623fd1d52..9e4c0f0915 100644 --- a/include/clang/AST/Expr.h +++ b/include/clang/AST/Expr.h @@ -1112,29 +1112,39 @@ public: }; class CharacterLiteral : public Expr { +public: + enum CharacterKind { + Ascii, + Wide, + UTF16, + UTF32 + }; + +private: unsigned Value; SourceLocation Loc; - bool IsWide; + unsigned Kind : 2; public: // type should be IntTy - CharacterLiteral(unsigned value, bool iswide, QualType type, SourceLocation l) + CharacterLiteral(unsigned value, CharacterKind kind, QualType type, + SourceLocation l) : Expr(CharacterLiteralClass, type, VK_RValue, OK_Ordinary, false, false, false, false), - Value(value), Loc(l), IsWide(iswide) { + Value(value), Loc(l), Kind(kind) { } /// \brief Construct an empty character literal. CharacterLiteral(EmptyShell Empty) : Expr(CharacterLiteralClass, Empty) { } SourceLocation getLocation() const { return Loc; } - bool isWide() const { return IsWide; } + CharacterKind getKind() const { return static_cast<CharacterKind>(Kind); } SourceRange getSourceRange() const { return SourceRange(Loc); } unsigned getValue() const { return Value; } void setLocation(SourceLocation Location) { Loc = Location; } - void setWide(bool W) { IsWide = W; } + void setKind(CharacterKind kind) { Kind = kind; } void setValue(unsigned Val) { Value = Val; } static bool classof(const Stmt *T) { @@ -1243,13 +1253,23 @@ public: /// In this case, getByteLength() will return 6, but the string literal will /// have type "char[2]". 
class StringLiteral : public Expr { +public: + enum StringKind { + Ascii, + Wide, + UTF8, + UTF16, + UTF32 + }; + +private: friend class ASTStmtReader; const char *StrData; unsigned ByteLength; - bool IsWide; - bool IsPascal; unsigned NumConcatenated; + unsigned Kind : 3; + bool IsPascal : 1; SourceLocation TokLocs[1]; StringLiteral(QualType Ty) : @@ -1259,14 +1279,15 @@ class StringLiteral : public Expr { public: /// This is the "fully general" constructor that allows representation of /// strings formed from multiple concatenated tokens. - static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide, + static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, const SourceLocation *Loc, unsigned NumStrs); /// Simple constructor for string literals made from one token. - static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide, - bool Pascal, QualType Ty, SourceLocation Loc) { - return Create(C, Str, Wide, Pascal, Ty, &Loc, 1); + static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind, + bool Pascal, QualType Ty, + SourceLocation Loc) { + return Create(C, Str, Kind, Pascal, Ty, &Loc, 1); } /// \brief Construct an empty string literal. @@ -1281,9 +1302,14 @@ public: /// \brief Sets the string data to the given string data. 
void setString(ASTContext &C, StringRef Str); - bool isWide() const { return IsWide; } + StringKind getKind() const { return static_cast<StringKind>(Kind); } + bool isAscii() const { return Kind == Ascii; } + bool isWide() const { return Kind == Wide; } + bool isUTF8() const { return Kind == UTF8; } + bool isUTF16() const { return Kind == UTF16; } + bool isUTF32() const { return Kind == UTF32; } bool isPascal() const { return IsPascal; } - + bool containsNonAsciiOrNull() const { StringRef Str = getString(); for (unsigned i = 0, e = Str.size(); i != e; ++i) diff --git a/include/clang/AST/Type.h b/include/clang/AST/Type.h index 8a842da440..2b72610226 100644 --- a/include/clang/AST/Type.h +++ b/include/clang/AST/Type.h @@ -1368,6 +1368,8 @@ public: bool isBooleanType() const; bool isCharType() const; bool isWideCharType() const; + bool isChar16Type() const; + bool isChar32Type() const; bool isAnyCharacterType() const; bool isIntegralType(ASTContext &Ctx) const; diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index 9e431a2d21..e23921be0b 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -77,8 +77,8 @@ def err_invalid_suffix_integer_constant : Error< "invalid suffix '%0' on integer constant">; def err_invalid_suffix_float_constant : Error< "invalid suffix '%0' on floating constant">; -def warn_extraneous_wide_char_constant : Warning< - "extraneous characters in wide character constant ignored">; +def warn_extraneous_char_constant : Warning< + "extraneous characters in character constant ignored">; def warn_char_constant_too_large : Warning< "character constant too long for its type">; def err_exponent_has_no_digits : Error<"exponent has no digits">; @@ -102,6 +102,8 @@ def warn_ucn_escape_too_large : ExtWarn< "character unicode escape sequence too long for its type">; def warn_ucn_not_valid_in_c89 : ExtWarn< "unicode escape sequences are only valid in C99 or C++">; +def 
err_unsupported_string_concat : Error< + "unsupported non-standard concatenation of string literals">; //===----------------------------------------------------------------------===// // PTH Diagnostics diff --git a/include/clang/Basic/IdentifierTable.h b/include/clang/Basic/IdentifierTable.h index be1fa196c0..3390f7809d 100644 --- a/include/clang/Basic/IdentifierTable.h +++ b/include/clang/Basic/IdentifierTable.h @@ -50,8 +50,8 @@ namespace clang { /// set, and all tok::identifier tokens have a pointer to one of these. class IdentifierInfo { // Note: DON'T make TokenID a 'tok::TokenKind'; MSVC will treat it as a - // signed char and TokenKinds > 127 won't be handled correctly. - unsigned TokenID : 8; // Front-end token ID or tok::identifier. + // signed char and TokenKinds > 255 won't be handled correctly. + unsigned TokenID : 9; // Front-end token ID or tok::identifier. // Objective-C keyword ('protocol' in '@protocol') or builtin (__builtin_inf). // First NUM_OBJC_KEYWORDS values are for Objective-C, the remaining values // are for builtins. @@ -65,7 +65,7 @@ class IdentifierInfo { // file and wasn't modified since. bool RevertedTokenID : 1; // True if RevertTokenIDToIdentifier was // called. - // 6 bits left in 32-bit word. + // 5 bits left in 32-bit word. void *FETokenInfo; // Managed by the language front-end. llvm::StringMapEntry *Entry; @@ -409,6 +409,7 @@ public: IdentifierInfo &get(StringRef Name, tok::TokenKind TokenCode) { IdentifierInfo &II = get(Name); II.TokenID = TokenCode; + assert(II.TokenID == TokenCode && "TokenCode too large"); return II; } diff --git a/include/clang/Basic/TokenKinds.def b/include/clang/Basic/TokenKinds.def index 86172b83ff..d057559889 100644 --- a/include/clang/Basic/TokenKinds.def +++ b/include/clang/Basic/TokenKinds.def @@ -114,13 +114,23 @@ TOK(raw_identifier) // Used only in raw lexing mode. 
TOK(numeric_constant) // 0x123 // C99 6.4.4: Character Constants -TOK(char_constant) // 'a' L'b' +TOK(char_constant) // 'a' +TOK(wide_char_constant) // L'b' + +// C++0x Character Constants +TOK(utf16_char_constant) // u'a' +TOK(utf32_char_constant) // U'a' // C99 6.4.5: String Literals. TOK(string_literal) // "foo" TOK(wide_string_literal) // L"foo" TOK(angle_string_literal)// +// C++0x String Literals. +TOK(utf8_string_literal) // u8"foo" +TOK(utf16_string_literal)// u"foo" +TOK(utf32_string_literal)// U"foo" + // C99 6.4.6: Punctuators. PUNCTUATOR(l_square, "[") PUNCTUATOR(r_square, "]") diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index 990c1eedbb..2c25597433 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -471,9 +471,11 @@ private: // Helper functions to lex the remainder of a token of the specific type. void LexIdentifier (Token &Result, const char *CurPtr); void LexNumericConstant (Token &Result, const char *CurPtr); - void LexStringLiteral (Token &Result, const char *CurPtr,bool Wide); + void LexStringLiteral (Token &Result, const char *CurPtr, + tok::TokenKind Kind); void LexAngledStringLiteral(Token &Result, const char *CurPtr); - void LexCharConstant (Token &Result, const char *CurPtr); + void LexCharConstant (Token &Result, const char *CurPtr, + tok::TokenKind Kind); bool LexEndOfFile (Token &Result, const char *CurPtr); bool SkipWhitespace (Token &Result, const char *CurPtr); diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h index 6486c38a40..15057299b2 100644 --- a/include/clang/Lex/LiteralSupport.h +++ b/include/clang/Lex/LiteralSupport.h @@ -19,6 +19,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/DataTypes.h" +#include "clang/Basic/TokenKinds.h" #include namespace clang { @@ -124,15 +125,19 @@ private: /// character literal. 
class CharLiteralParser { uint64_t Value; - bool IsWide; + tok::TokenKind Kind; bool IsMultiChar; bool HadError; public: CharLiteralParser(const char *begin, const char *end, - SourceLocation Loc, Preprocessor &PP); + SourceLocation Loc, Preprocessor &PP, + tok::TokenKind kind); bool hadError() const { return HadError; } - bool isWide() const { return IsWide; } + bool isAscii() const { return Kind == tok::char_constant; } + bool isWide() const { return Kind == tok::wide_char_constant; } + bool isUTF16() const { return Kind == tok::utf16_char_constant; } + bool isUTF32() const { return Kind == tok::utf32_char_constant; } bool isMultiChar() const { return IsMultiChar; } uint64_t getValue() const { return Value; } }; @@ -148,7 +153,8 @@ class StringLiteralParser { unsigned MaxTokenLength; unsigned SizeBound; - unsigned wchar_tByteWidth; + unsigned CharByteWidth; + tok::TokenKind Kind; llvm::SmallString<512> ResultBuf; char *ResultPtr; // cursor public: @@ -158,14 +164,13 @@ public: const SourceManager &sm, const LangOptions &features, const TargetInfo &target, Diagnostic *diags = 0) : SM(sm), Features(features), Target(target), Diags(diags), - MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0), - ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) { + MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), + ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { init(StringToks, NumStringToks); } bool hadError; - bool AnyWide; bool Pascal; StringRef GetString() const { @@ -174,9 +179,7 @@ public: unsigned GetStringLength() const { return ResultPtr-ResultBuf.data(); } unsigned GetNumStringChars() const { - if (AnyWide) - return GetStringLength() / wchar_tByteWidth; - return GetStringLength(); + return GetStringLength() / CharByteWidth; } /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. 
This handles @@ -185,7 +188,13 @@ public: /// If the Diagnostics pointer is non-null, then this will do semantic /// checking of the string literal and emit errors and warnings. unsigned getOffsetOfStringByte(const Token &TheTok, unsigned ByteNo) const; - + + bool isAscii() { return Kind == tok::string_literal; } + bool isWide() { return Kind == tok::wide_string_literal; } + bool isUTF8() { return Kind == tok::utf8_string_literal; } + bool isUTF16() { return Kind == tok::utf16_string_literal; } + bool isUTF32() { return Kind == tok::utf32_string_literal; } + private: void init(const Token *StringToks, unsigned NumStringToks); }; diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h index 9cf11d9a64..e6dd1607e8 100644 --- a/include/clang/Lex/Token.h +++ b/include/clang/Lex/Token.h @@ -96,7 +96,10 @@ public: /// constant, string, etc. bool isLiteral() const { return is(tok::numeric_constant) || is(tok::char_constant) || - is(tok::string_literal) || is(tok::wide_string_literal) || + is(tok::wide_char_constant) || is(tok::utf16_char_constant) || + is(tok::utf32_char_constant) || is(tok::string_literal) || + is(tok::wide_string_literal) || is(tok::utf8_string_literal) || + is(tok::utf16_string_literal) || is(tok::utf32_string_literal) || is(tok::angle_string_literal); } diff --git a/include/clang/Lex/TokenConcatenation.h b/include/clang/Lex/TokenConcatenation.h index 094990a6e3..551300f402 100644 --- a/include/clang/Lex/TokenConcatenation.h +++ b/include/clang/Lex/TokenConcatenation.h @@ -63,12 +63,9 @@ namespace clang { const Token &Tok) const; private: - /// StartsWithL - Return true if the spelling of this token starts with 'L'. - bool StartsWithL(const Token &Tok) const; - - /// IsIdentifierL - Return true if the spelling of this token is literally - /// 'L'. - bool IsIdentifierL(const Token &Tok) const; + /// IsIdentifierStringPrefix - Return true if the spelling of the token + /// is literally 'L', 'u', 'U', or 'u8'. 
+ bool IsIdentifierStringPrefix(const Token &Tok) const; }; } // end clang namespace diff --git a/include/clang/Parse/Parser.h b/include/clang/Parse/Parser.h index 5d9376c1f7..83b0cd455e 100644 --- a/include/clang/Parse/Parser.h +++ b/include/clang/Parse/Parser.h @@ -265,7 +265,10 @@ private: /// bool isTokenStringLiteral() const { return Tok.getKind() == tok::string_literal || - Tok.getKind() == tok::wide_string_literal; + Tok.getKind() == tok::wide_string_literal || + Tok.getKind() == tok::utf8_string_literal || + Tok.getKind() == tok::utf16_string_literal || + Tok.getKind() == tok::utf32_string_literal; } /// \brief Returns true if the current token is a '=' or '==' and diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp index 2ea79912d1..d6e7d77d0f 100644 --- a/lib/AST/ASTImporter.cpp +++ b/lib/AST/ASTImporter.cpp @@ -3814,8 +3814,8 @@ Expr *ASTNodeImporter::VisitCharacterLiteral(CharacterLiteral *E) { if (T.isNull()) return 0; - return new (Importer.getToContext()) CharacterLiteral(E->getValue(), - E->isWide(), T, + return new (Importer.getToContext()) CharacterLiteral(E->getValue(), + E->getKind(), T, Importer.Import(E->getLocation())); } diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp index 58fb32d278..5e795be56d 100644 --- a/lib/AST/Expr.cpp +++ b/lib/AST/Expr.cpp @@ -533,8 +533,7 @@ double FloatingLiteral::getValueAsApproximateDouble() const { } StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str, - bool Wide, - bool Pascal, QualType Ty, + StringKind Kind, bool Pascal, QualType Ty, const SourceLocation *Loc, unsigned NumStrs) { // Allocate enough space for the StringLiteral plus an array of locations for @@ -549,7 +548,7 @@ StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str, memcpy(AStrData, Str.data(), Str.size()); SL->StrData = AStrData; SL->ByteLength = Str.size(); - SL->IsWide = Wide; + SL->Kind = Kind; SL->IsPascal = Pascal; SL->TokLocs[0] = Loc[0]; SL->NumConcatenated = NumStrs; @@ -587,8 +586,8 @@ void 
StringLiteral::setString(ASTContext &C, StringRef Str) { SourceLocation StringLiteral:: getLocationOfByte(unsigned ByteNo, const SourceManager &SM, const LangOptions &Features, const TargetInfo &Target) const { - assert(!isWide() && "This doesn't work for wide strings yet"); - + assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings"); + // Loop over all of the tokens in this string until we find the one that // contains the byte we're looking for. unsigned TokNo = 0; diff --git a/lib/AST/StmtDumper.cpp b/lib/AST/StmtDumper.cpp index 7218af570f..ce4ae8e773 100644 --- a/lib/AST/StmtDumper.cpp +++ b/lib/AST/StmtDumper.cpp @@ -443,8 +443,13 @@ void StmtDumper::VisitStringLiteral(StringLiteral *Str) { DumpExpr(Str); // FIXME: this doesn't print wstrings right. OS << " "; - if (Str->isWide()) - OS << "L"; + switch (Str->getKind()) { + case StringLiteral::Ascii: break; // No prefix + case StringLiteral::Wide: OS << 'L'; break; + case StringLiteral::UTF8: OS << "u8"; break; + case StringLiteral::UTF16: OS << 'u'; break; + case StringLiteral::UTF32: OS << 'U'; break; + } OS << '"'; OS.write_escaped(Str->getString()); OS << '"'; diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp index 8fcad14ec2..79f14bc658 100644 --- a/lib/AST/StmtPrinter.cpp +++ b/lib/AST/StmtPrinter.cpp @@ -599,8 +599,14 @@ void StmtPrinter::VisitPredefinedExpr(PredefinedExpr *Node) { void StmtPrinter::VisitCharacterLiteral(CharacterLiteral *Node) { unsigned value = Node->getValue(); - if (Node->isWide()) - OS << "L"; + + switch (Node->getKind()) { + case CharacterLiteral::Ascii: break; // no prefix. 
+ case CharacterLiteral::Wide: OS << 'L'; break; + case CharacterLiteral::UTF16: OS << 'u'; break; + case CharacterLiteral::UTF32: OS << 'U'; break; + } + switch (value) { case '\\': OS << "'\\\\'"; @@ -672,7 +678,13 @@ void StmtPrinter::VisitImaginaryLiteral(ImaginaryLiteral *Node) { } void StmtPrinter::VisitStringLiteral(StringLiteral *Str) { - if (Str->isWide()) OS << 'L'; + switch (Str->getKind()) { + case StringLiteral::Ascii: break; // no prefix. + case StringLiteral::Wide: OS << 'L'; break; + case StringLiteral::UTF8: OS << "u8"; break; + case StringLiteral::UTF16: OS << 'u'; break; + case StringLiteral::UTF32: OS << 'U'; break; + } OS << '"'; // FIXME: this doesn't print wstrings right. diff --git a/lib/AST/StmtProfile.cpp b/lib/AST/StmtProfile.cpp index 120c9e50a9..12321ef0d6 100644 --- a/lib/AST/StmtProfile.cpp +++ b/lib/AST/StmtProfile.cpp @@ -252,7 +252,7 @@ void StmtProfiler::VisitIntegerLiteral(const IntegerLiteral *S) { void StmtProfiler::VisitCharacterLiteral(const CharacterLiteral *S) { VisitExpr(S); - ID.AddBoolean(S->isWide()); + ID.AddInteger(S->getKind()); ID.AddInteger(S->getValue()); } @@ -269,7 +269,7 @@ void StmtProfiler::VisitImaginaryLiteral(const ImaginaryLiteral *S) { void StmtProfiler::VisitStringLiteral(const StringLiteral *S) { VisitExpr(S); ID.AddString(S->getString()); - ID.AddBoolean(S->isWide()); + ID.AddInteger(S->getKind()); } void StmtProfiler::VisitParenExpr(const ParenExpr *S) { diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp index 7cd3be2fb4..2555ab31fb 100644 --- a/lib/AST/Type.cpp +++ b/lib/AST/Type.cpp @@ -635,6 +635,18 @@ bool Type::isWideCharType() const { return false; } +bool Type::isChar16Type() const { + if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType)) + return BT->getKind() == BuiltinType::Char16; + return false; +} + +bool Type::isChar32Type() const { + if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType)) + return BT->getKind() == BuiltinType::Char32; + return false; +} + /// \brief Determine whether this 
type is any of the built-in character /// types. bool Type::isAnyCharacterType() const { diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp index 290fe242c9..ce32325aca 100644 --- a/lib/CodeGen/CodeGenModule.cpp +++ b/lib/CodeGen/CodeGenModule.cpp @@ -1877,8 +1877,20 @@ std::string CodeGenModule::GetStringForStringLiteral(const StringLiteral *E) { // Resize the string to the right size. uint64_t RealLen = CAT->getSize().getZExtValue(); - if (E->isWide()) + switch (E->getKind()) { + case StringLiteral::Ascii: + case StringLiteral::UTF8: + break; + case StringLiteral::Wide: RealLen *= Context.Target.getWCharWidth() / Context.getCharWidth(); + break; + case StringLiteral::UTF16: + RealLen *= Context.Target.getChar16Width() / Context.getCharWidth(); + break; + case StringLiteral::UTF32: + RealLen *= Context.Target.getChar32Width() / Context.getCharWidth(); + break; + } std::string Str = E->getString().str(); Str.resize(RealLen, '\0'); @@ -1893,7 +1905,7 @@ CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S) { // FIXME: This can be more efficient. // FIXME: We shouldn't need to bitcast the constant in the wide string case. llvm::Constant *C = GetAddrOfConstantString(GetStringForStringLiteral(S)); - if (S->isWide()) { + if (S->isWide() || S->isUTF16() || S->isUTF32()) { llvm::Type *DestTy = llvm::PointerType::getUnqual(getTypes().ConvertType(S->getType())); C = llvm::ConstantExpr::getBitCast(C, DestTy); diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 6c7169f89b..44674a93d7 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -1267,8 +1267,9 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { } /// LexStringLiteral - Lex the remainder of a string literal, after having lexed -/// either " or L". -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { +/// either " or L" or u8" or u" or U". 
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, + tok::TokenKind Kind) { const char *NulCharacter = 0; // Does this string contain the \0 character? char C = getAndAdvanceChar(CurPtr, Result); @@ -1299,8 +1300,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { // Update the location of the token as well as the BufferPtr instance var. const char *TokStart = BufferPtr; - FormTokenWithChars(Result, CurPtr, - Wide ? tok::wide_string_literal : tok::string_literal); + FormTokenWithChars(Result, CurPtr, Kind); Result.setLiteralData(TokStart); } @@ -1339,8 +1339,9 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { /// LexCharConstant - Lex the remainder of a character constant, after having -/// lexed either ' or L'. -void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { +/// lexed either ' or L' or u' or U'. +void Lexer::LexCharConstant(Token &Result, const char *CurPtr, + tok::TokenKind Kind) { const char *NulCharacter = 0; // Does this character contain the \0 character? char C = getAndAdvanceChar(CurPtr, Result); @@ -1377,7 +1378,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; - FormTokenWithChars(Result, CurPtr, tok::char_constant); + FormTokenWithChars(Result, CurPtr, Kind); Result.setLiteralData(TokStart); } @@ -2185,6 +2186,55 @@ LexNextToken: MIOpt.ReadToken(); return LexNumericConstant(Result, CurPtr); + case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal + // Notify MIOpt that we read a non-whitespace/non-comment token. 
+ MIOpt.ReadToken(); + + if (Features.CPlusPlus0x) { + Char = getCharAndSize(CurPtr, SizeTmp); + + // UTF-16 string literal + if (Char == '"') + return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), + tok::utf16_string_literal); + + // UTF-16 character constant + if (Char == '\'') + return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), + tok::utf16_char_constant); + + // UTF-8 string literal + if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') + return LexStringLiteral(Result, + ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + tok::utf8_string_literal); + } + + // treat u like the start of an identifier. + return LexIdentifier(Result, CurPtr); + + case 'U': // Identifier (Uber) or C++0x UTF-32 string literal + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + + if (Features.CPlusPlus0x) { + Char = getCharAndSize(CurPtr, SizeTmp); + + // UTF-32 string literal + if (Char == '"') + return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), + tok::utf32_string_literal); + + // UTF-32 character constant + if (Char == '\'') + return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), + tok::utf32_char_constant); + } + + // treat U like the start of an identifier. + return LexIdentifier(Result, CurPtr); + case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); @@ -2193,21 +2243,22 @@ LexNextToken: // Wide string literal. if (Char == '"') return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), - true); + tok::wide_string_literal); // Wide character constant. if (Char == '\'') - return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); + return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), + tok::wide_char_constant); // FALL THROUGH, treating L like the start of an identifier. // C99 6.4.2: Identifiers. 
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': - case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': + case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': /*'U'*/ case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': - case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': + case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ case 'v': case 'w': case 'x': case 'y': case 'z': case '_': // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -2230,13 +2281,13 @@ LexNextToken: case '\'': // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); - return LexCharConstant(Result, CurPtr); + return LexCharConstant(Result, CurPtr, tok::char_constant); // C99 6.4.5: String Literals. case '"': // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); - return LexStringLiteral(Result, CurPtr, false); + return LexStringLiteral(Result, CurPtr, tok::string_literal); // C99 6.4.6: Punctuators. 
case '?': diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index f8a2a55117..82493408e6 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -28,12 +28,31 @@ static int HexDigitValue(char C) { return -1; } +static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { + switch (kind) { + default: assert(0 && "Unknown token type!"); + case tok::char_constant: + case tok::string_literal: + case tok::utf8_string_literal: + return Target.getCharWidth(); + case tok::wide_char_constant: + case tok::wide_string_literal: + return Target.getWCharWidth(); + case tok::utf16_char_constant: + case tok::utf16_string_literal: + return Target.getChar16Width(); + case tok::utf32_char_constant: + case tok::utf32_string_literal: + return Target.getChar32Width(); + } +} + /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. static unsigned ProcessCharEscape(const char *&ThisTokBuf, const char *ThisTokEnd, bool &HadError, - FullSourceLoc Loc, bool IsWide, - Diagnostic *Diags, const TargetInfo &Target) { + FullSourceLoc Loc, unsigned CharWidth, + Diagnostic *Diags) { // Skip the '\' char. ++ThisTokBuf; @@ -98,9 +117,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, } // See if any bits will be truncated when evaluated as a character. - unsigned CharWidth = - IsWide ? Target.getWCharWidth() : Target.getCharWidth(); - if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { Overflow = true; ResultChar &= ~0U >> (32-CharWidth); @@ -128,9 +144,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); // Check for overflow. Reject '\777', but not L'\777'. - unsigned CharWidth = - IsWide ? 
Target.getWCharWidth() : Target.getCharWidth(); - if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { if (Diags) Diags->Report(Loc, diag::warn_octal_escape_too_large); @@ -219,8 +232,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, /// we will likely rework our support for UCN's. static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, char *&ResultBuf, bool &HadError, - FullSourceLoc Loc, bool wide, Diagnostic *Diags, - const LangOptions &Features) { + FullSourceLoc Loc, unsigned CharByteWidth, + Diagnostic *Diags, const LangOptions &Features) { typedef uint32_t UTF32; UTF32 UcnVal = 0; unsigned short UcnLen = 0; @@ -230,19 +243,22 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, return; } - if (wide) { - (void)UcnLen; - assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); + assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && + "only character widths of 1, 2, or 4 bytes supported"); - if (!Features.ShortWChar) { - // Note: our internal rep of wide char tokens is always little-endian. - *ResultBuf++ = (UcnVal & 0x000000FF); - *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; - *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; - *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; - return; - } + (void)UcnLen; + assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); + + if (CharByteWidth == 4) { + // Note: our internal rep of wide char tokens is always little-endian. + *ResultBuf++ = (UcnVal & 0x000000FF); + *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; + *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; + *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; + return; + } + if (CharByteWidth == 2) { // Convert to UTF16. 
if (UcnVal < (UTF32)0xFFFF) { *ResultBuf++ = (UcnVal & 0x000000FF); @@ -261,6 +277,9 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8; return; } + + assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); + // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. // The conversion below was inspired by: // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c @@ -695,13 +714,18 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { CharLiteralParser::CharLiteralParser(const char *begin, const char *end, - SourceLocation Loc, Preprocessor &PP) { + SourceLocation Loc, Preprocessor &PP, + tok::TokenKind kind) { // At this point we know that the character matches the regex "L?'.*'". HadError = false; - // Determine if this is a wide character. - IsWide = begin[0] == 'L'; - if (IsWide) ++begin; + Kind = kind; + + // Determine if this is a wide or UTF character. + if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant || + Kind == tok::utf32_char_constant) { + ++begin; + } // Skip over the entry quote. assert(begin[0] == '\'' && "Invalid token lexed"); @@ -742,17 +766,17 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, ResultChar = utf32; } else { // Otherwise, this is a non-UCN escape character. Process it. + unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); ResultChar = ProcessCharEscape(begin, end, HadError, FullSourceLoc(Loc,PP.getSourceManager()), - IsWide, - &PP.getDiagnostics(), PP.getTargetInfo()); + CharWidth, &PP.getDiagnostics()); } } // If this is a multi-character constant (e.g. 'abc'), handle it. These are // implementation defined (C99 6.4.4.4p10). if (NumCharsSoFar) { - if (IsWide) { + if (!isAscii()) { // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'. 
LitVal = 0; } else { @@ -774,8 +798,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, if (NumCharsSoFar > 1) { // Warn about discarding the top bits for multi-char wide-character // constants (L'abcd'). - if (IsWide) - PP.Diag(Loc, diag::warn_extraneous_wide_char_constant); + if (!isAscii()) + PP.Diag(Loc, diag::warn_extraneous_char_constant); else if (NumCharsSoFar != 4) PP.Diag(Loc, diag::ext_multichar_character_literal); else @@ -787,14 +811,15 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // Transfer the value from APInt to uint64_t Value = LitVal.getZExtValue(); - if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF) + if (((isWide() && PP.getLangOptions().ShortWChar) || isUTF16()) && + Value > 0xFFFF) PP.Diag(Loc, diag::warn_ucn_escape_too_large); // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple // character constants are not sign extended in the this implementation: // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. - if (!IsWide && NumCharsSoFar == 1 && (Value & 128) && + if (isAscii() && NumCharsSoFar == 1 && (Value & 128) && PP.getLangOptions().CharIsSigned) Value = (signed char)Value; } @@ -839,8 +864,8 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks, Preprocessor &PP, bool Complain) : SM(PP.getSourceManager()), Features(PP.getLangOptions()), Target(PP.getTargetInfo()), Diags(Complain ? 
&PP.getDiagnostics() : 0), - MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0), - ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) { + MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), + ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { init(StringToks, NumStringToks); } @@ -860,7 +885,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ MaxTokenLength = StringToks[0].getLength(); assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); SizeBound = StringToks[0].getLength()-2; // -2 for "". - AnyWide = StringToks[0].is(tok::wide_string_literal); + Kind = StringToks[0].getKind(); hadError = false; @@ -881,8 +906,18 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ if (StringToks[i].getLength() > MaxTokenLength) MaxTokenLength = StringToks[i].getLength(); - // Remember if we see any wide strings. - AnyWide |= StringToks[i].is(tok::wide_string_literal); + // Remember if we see any wide or utf-8/16/32 strings. + // Also check for illegal concatenations. + if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) { + if (isAscii()) { + Kind = StringToks[i].getKind(); + } else { + if (Diags) + Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + diag::err_unsupported_string_concat); + hadError = true; + } + } } // Include space for the null terminator. @@ -890,19 +925,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ // TODO: K&R warning: "traditional C rejects string constant concatenation" - // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not - // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true. 
- wchar_tByteWidth = ~0U; - if (AnyWide) { - wchar_tByteWidth = Target.getWCharWidth(); - assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!"); - wchar_tByteWidth /= 8; - } + // Get the width in bytes of char/wchar_t/char16_t/char32_t + CharByteWidth = getCharWidth(Kind, Target); + assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple"); + CharByteWidth /= 8; // The output buffer size needs to be large enough to hold wide characters. // This is a worst-case assumption which basically corresponds to L"" "long". - if (AnyWide) - SizeBound *= wchar_tByteWidth; + SizeBound *= CharByteWidth; // Size the temporary buffer to hold the result string data. ResultBuf.resize(SizeBound); @@ -927,18 +957,19 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features, &StringInvalid); if (StringInvalid) { - hadError = 1; + hadError = true; continue; } const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. - bool wide = false; // TODO: Input character set mapping support. // Skip L marker for wide strings. - if (ThisTokBuf[0] == 'L') { - wide = true; + if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') { ++ThisTokBuf; + // Skip 8 of u8 marker for utf8 strings. + if (ThisTokBuf[0] == '8') + ++ThisTokBuf; } assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); @@ -967,7 +998,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ // Copy the character span over. unsigned Len = ThisTokBuf-InStart; - if (!AnyWide) { + if (CharByteWidth == 1) { memcpy(ResultPtr, InStart, Len); ResultPtr += Len; } else { @@ -975,7 +1006,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ for (; Len; --Len, ++InStart) { *ResultPtr++ = InStart[0]; // Add zeros at the end. 
- for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) + for (unsigned i = 1, e = CharByteWidth; i != e; ++i) *ResultPtr++ = 0; } } @@ -985,29 +1016,26 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, hadError, FullSourceLoc(StringToks[i].getLocation(),SM), - wide, Diags, Features); + CharByteWidth, Diags, Features); continue; } // Otherwise, this is a non-UCN escape character. Process it. unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, FullSourceLoc(StringToks[i].getLocation(), SM), - AnyWide, Diags, Target); + CharByteWidth*8, Diags); // Note: our internal rep of wide char tokens is always little-endian. *ResultPtr++ = ResultChar & 0xFF; - if (AnyWide) { - for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) - *ResultPtr++ = ResultChar >> i*8; - } + for (unsigned i = 1, e = CharByteWidth; i != e; ++i) + *ResultPtr++ = ResultChar >> i*8; } } if (Pascal) { ResultBuf[0] = ResultPtr-&ResultBuf[0]-1; - if (AnyWide) - ResultBuf[0] /= wchar_tByteWidth; + ResultBuf[0] /= CharByteWidth; // Verify that pascal strings aren't too large. 
if (GetStringLength() > 256) { @@ -1016,7 +1044,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ diag::err_pascal_string_too_long) << SourceRange(StringToks[0].getLocation(), StringToks[NumStringToks-1].getLocation()); - hadError = 1; + hadError = true; return; } } else if (Diags) { @@ -1050,7 +1078,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, if (StringInvalid) return 0; - assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet"); + assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' && + SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet"); const char *SpellingStart = SpellingPtr; @@ -1075,7 +1104,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, bool HadError = false; ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), - false, Diags, Target); + CharByteWidth*8, Diags); assert(!HadError && "This method isn't valid on erroneous strings"); --ByteNo; } diff --git a/lib/Lex/MacroArgs.cpp b/lib/Lex/MacroArgs.cpp index 968c15e3c2..ccd0b705c8 100644 --- a/lib/Lex/MacroArgs.cpp +++ b/lib/Lex/MacroArgs.cpp @@ -208,7 +208,13 @@ Token MacroArgs::StringifyArgument(const Token *ArgToks, // by 6.10.3.2p2. if (Tok.is(tok::string_literal) || // "foo" Tok.is(tok::wide_string_literal) || // L"foo" - Tok.is(tok::char_constant)) { // 'x' and L'x'. + Tok.is(tok::utf8_string_literal) || // u8"foo" + Tok.is(tok::utf16_string_literal) || // u"foo" + Tok.is(tok::utf32_string_literal) || // U"foo" + Tok.is(tok::char_constant) || // 'x' + Tok.is(tok::wide_char_constant) || // L'x'. + Tok.is(tok::utf16_char_constant) || // u'x'. + Tok.is(tok::utf32_char_constant)) { // U'x'. 
bool Invalid = false; std::string TokStr = PP.getSpelling(Tok, &Invalid); if (!Invalid) { diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp index 212ffeef1b..383c6f5aa1 100644 --- a/lib/Lex/PPDirectives.cpp +++ b/lib/Lex/PPDirectives.cpp @@ -777,7 +777,7 @@ void Preprocessor::HandleLineDirective(Token &Tok) { } else { // Parse and validate the string, converting it into a unique ID. StringLiteralParser Literal(&StrTok, 1, *this); - assert(!Literal.AnyWide && "Didn't allow wide strings in"); + assert(Literal.isAscii() && "Didn't allow wide strings in"); if (Literal.hadError) return DiscardUntilEndOfDirective(); if (Literal.Pascal) { @@ -910,7 +910,7 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) { } else { // Parse and validate the string, converting it into a unique ID. StringLiteralParser Literal(&StrTok, 1, *this); - assert(!Literal.AnyWide && "Didn't allow wide strings in"); + assert(Literal.isAscii() && "Didn't allow wide strings in"); if (Literal.hadError) return DiscardUntilEndOfDirective(); if (Literal.Pascal) { diff --git a/lib/Lex/PPExpressions.cpp b/lib/Lex/PPExpressions.cpp index 08e2705ef1..25816923c8 100644 --- a/lib/Lex/PPExpressions.cpp +++ b/lib/Lex/PPExpressions.cpp @@ -236,7 +236,10 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, PP.LexNonComment(PeekTok); return false; } - case tok::char_constant: { // 'x' + case tok::char_constant: // 'x' + case tok::wide_char_constant: { // L'x' + case tok::utf16_char_constant: // u'x' + case tok::utf32_char_constant: // U'x' llvm::SmallString<32> CharBuffer; bool CharInvalid = false; StringRef ThisTok = PP.getSpelling(PeekTok, CharBuffer, &CharInvalid); @@ -244,7 +247,7 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, return true; CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(), - PeekTok.getLocation(), PP); + PeekTok.getLocation(), PP, PeekTok.getKind()); if (Literal.hadError()) return true; // A diagnostic 
was already emitted. @@ -255,6 +258,10 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, NumBits = TI.getIntWidth(); else if (Literal.isWide()) NumBits = TI.getWCharWidth(); + else if (Literal.isUTF16()) + NumBits = TI.getChar16Width(); + else if (Literal.isUTF32()) + NumBits = TI.getChar32Width(); else NumBits = TI.getCharWidth(); @@ -262,8 +269,9 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, llvm::APSInt Val(NumBits); // Set the value. Val = Literal.getValue(); - // Set the signedness. - Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned); + // Set the signedness. UTF-16 and UTF-32 are always unsigned + if (!Literal.isUTF16() && !Literal.isUTF32()) + Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned); if (Result.Val.getBitWidth() > Val.getBitWidth()) { Result.Val = Val.extend(Result.Val.getBitWidth()); diff --git a/lib/Lex/Pragma.cpp b/lib/Lex/Pragma.cpp index d94e2e8305..1d0b5e4f2d 100644 --- a/lib/Lex/Pragma.cpp +++ b/lib/Lex/Pragma.cpp @@ -444,7 +444,7 @@ void Preprocessor::HandlePragmaComment(Token &Tok) { // Concatenate and parse the strings. StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this); - assert(!Literal.AnyWide && "Didn't allow wide strings in"); + assert(Literal.isAscii() && "Didn't allow wide strings in"); if (Literal.hadError) return; if (Literal.Pascal) { @@ -520,7 +520,7 @@ void Preprocessor::HandlePragmaMessage(Token &Tok) { // Concatenate and parse the strings. StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this); - assert(!Literal.AnyWide && "Didn't allow wide strings in"); + assert(Literal.isAscii() && "Didn't allow wide strings in"); if (Literal.hadError) return; if (Literal.Pascal) { @@ -902,7 +902,7 @@ public: // Concatenate and parse the strings. 
StringLiteralParser Literal(&StrToks[0], StrToks.size(), PP); - assert(!Literal.AnyWide && "Didn't allow wide strings in"); + assert(Literal.isAscii() && "Didn't allow wide strings in"); if (Literal.hadError) return; if (Literal.Pascal) { diff --git a/lib/Lex/TokenConcatenation.cpp b/lib/Lex/TokenConcatenation.cpp index 3e9e855031..19baf80aad 100644 --- a/lib/Lex/TokenConcatenation.cpp +++ b/lib/Lex/TokenConcatenation.cpp @@ -17,42 +17,39 @@ using namespace clang; -/// StartsWithL - Return true if the spelling of this token starts with 'L'. -bool TokenConcatenation::StartsWithL(const Token &Tok) const { - if (!Tok.needsCleaning()) { - SourceManager &SM = PP.getSourceManager(); - return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L'; - } - - if (Tok.getLength() < 256) { - char Buffer[256]; - const char *TokPtr = Buffer; - PP.getSpelling(Tok, TokPtr); - return TokPtr[0] == 'L'; - } - - return PP.getSpelling(Tok)[0] == 'L'; -} +/// IsIdentifierStringPrefix - Return true if the spelling of the token +/// is literally 'L', 'u', 'U', or 'u8'. +bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { + const LangOptions &LangOpts = PP.getLangOptions(); -/// IsIdentifierL - Return true if the spelling of this token is literally -/// 'L'. 
-bool TokenConcatenation::IsIdentifierL(const Token &Tok) const { if (!Tok.needsCleaning()) { - if (Tok.getLength() != 1) + if (Tok.getLength() != 1 && Tok.getLength() != 2) return false; SourceManager &SM = PP.getSourceManager(); - return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L'; + const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); + if (Tok.getLength() == 1) + return Ptr[0] == 'L' || + (LangOpts.CPlusPlus0x && (Ptr[0] == 'u' || Ptr[0] == 'U')); + if (Tok.getLength() == 2) + return LangOpts.CPlusPlus0x && Ptr[0] == 'u' && Ptr[1] == '8'; } if (Tok.getLength() < 256) { char Buffer[256]; const char *TokPtr = Buffer; - if (PP.getSpelling(Tok, TokPtr) != 1) - return false; - return TokPtr[0] == 'L'; + unsigned length = PP.getSpelling(Tok, TokPtr); + if (length == 1) + return TokPtr[0] == 'L' || + (LangOpts.CPlusPlus0x && (TokPtr[0] == 'u' || TokPtr[0] == 'U')); + if (length == 2) + return LangOpts.CPlusPlus0x && TokPtr[0] == 'u' && TokPtr[1] == '8'; + return false; } - return PP.getSpelling(Tok) == "L"; + std::string TokStr = PP.getSpelling(Tok); + return TokStr == "L" || (LangOpts.CPlusPlus0x && (TokStr == "u8" || + TokStr == "u" || + TokStr == "U")); } TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { @@ -179,24 +176,19 @@ bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, if (Tok.is(tok::numeric_constant)) return GetFirstChar(PP, Tok) != '.'; - if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) /* || - Tok.is(tok::wide_char_literal)*/) + if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) || + Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) || + Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) || + Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant)) return true; // If this isn't identifier + string, we're done. 
if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal)) return false; - // FIXME: need a wide_char_constant! - - // If the string was a wide string L"foo" or wide char L'f', it would - // concat with the previous identifier into fooL"bar". Avoid this. - if (StartsWithL(Tok)) - return true; - // Otherwise, this is a narrow character or string. If the *identifier* - // is a literal 'L', avoid pasting L "foo" -> L"foo". - return IsIdentifierL(PrevTok); + // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo". + return IsIdentifierStringPrefix(PrevTok); case tok::numeric_constant: return isalnum(FirstChar) || Tok.is(tok::numeric_constant) || FirstChar == '+' || FirstChar == '-' || FirstChar == '.'; diff --git a/lib/Parse/ParseCXXInlineMethods.cpp b/lib/Parse/ParseCXXInlineMethods.cpp index f5c69981ca..e16448080c 100644 --- a/lib/Parse/ParseCXXInlineMethods.cpp +++ b/lib/Parse/ParseCXXInlineMethods.cpp @@ -553,6 +553,9 @@ bool Parser::ConsumeAndStoreUntil(tok::TokenKind T1, tok::TokenKind T2, case tok::string_literal: case tok::wide_string_literal: + case tok::utf8_string_literal: + case tok::utf16_string_literal: + case tok::utf32_string_literal: Toks.push_back(Tok); ConsumeStringToken(); break; diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index 869d9de47e..3cd1f3987a 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -769,6 +769,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, break; } case tok::char_constant: // constant: character-constant + case tok::wide_char_constant: + case tok::utf16_char_constant: + case tok::utf32_char_constant: Res = Actions.ActOnCharacterConstant(Tok); ConsumeToken(); break; @@ -780,6 +783,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, break; case tok::string_literal: // primary-expression: string-literal case tok::wide_string_literal: + case tok::utf8_string_literal: + case tok::utf16_string_literal: + case tok::utf32_string_literal: 
Res = ParseStringLiteralExpression(); break; case tok::kw__Generic: // primary-expression: generic-selection [C1X 6.5.1] diff --git a/lib/Parse/ParseTentative.cpp b/lib/Parse/ParseTentative.cpp index 2ba0fc673f..3f245a376c 100644 --- a/lib/Parse/ParseTentative.cpp +++ b/lib/Parse/ParseTentative.cpp @@ -605,8 +605,14 @@ Parser::isExpressionOrTypeSpecifierSimple(tok::TokenKind Kind) { // Obviously starts an expression. case tok::numeric_constant: case tok::char_constant: + case tok::wide_char_constant: + case tok::utf16_char_constant: + case tok::utf32_char_constant: case tok::string_literal: case tok::wide_string_literal: + case tok::utf8_string_literal: + case tok::utf16_string_literal: + case tok::utf32_string_literal: case tok::l_square: case tok::l_paren: case tok::amp: diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 9dc867c92b..5bb4165fbb 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -298,6 +298,9 @@ bool Parser::SkipUntil(const tok::TokenKind *Toks, unsigned NumToks, case tok::string_literal: case tok::wide_string_literal: + case tok::utf8_string_literal: + case tok::utf16_string_literal: + case tok::utf32_string_literal: ConsumeStringToken(); break; diff --git a/lib/Rewrite/HTMLRewrite.cpp b/lib/Rewrite/HTMLRewrite.cpp index 27f383f46c..ad2491c8fb 100644 --- a/lib/Rewrite/HTMLRewrite.cpp +++ b/lib/Rewrite/HTMLRewrite.cpp @@ -397,8 +397,15 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) { HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart, "", ""); break; + case tok::utf8_string_literal: + // Chop off the u part of u8 prefix + ++TokOffs; + --TokLen; + // FALL THROUGH to chop the 8 case tok::wide_string_literal: - // Chop off the L prefix + case tok::utf16_string_literal: + case tok::utf32_string_literal: + // Chop off the L, u, U or 8 prefix ++TokOffs; --TokLen; // FALL THROUGH. 
diff --git a/lib/Rewrite/RewriteObjC.cpp b/lib/Rewrite/RewriteObjC.cpp index 585b43cf54..a8fefb0165 100644 --- a/lib/Rewrite/RewriteObjC.cpp +++ b/lib/Rewrite/RewriteObjC.cpp @@ -2111,8 +2111,8 @@ Stmt *RewriteObjC::RewriteAtEncode(ObjCEncodeExpr *Exp) { std::string StrEncoding; Context->getObjCEncodingForType(Exp->getEncodedType(), StrEncoding); Expr *Replacement = StringLiteral::Create(*Context, StrEncoding, - false, false, StrType, - SourceLocation()); + StringLiteral::Ascii, false, + StrType, SourceLocation()); ReplaceStmt(Exp, Replacement); // Replace this subexpr in the parent. @@ -2129,8 +2129,8 @@ Stmt *RewriteObjC::RewriteAtSelector(ObjCSelectorExpr *Exp) { QualType argType = Context->getPointerType(Context->CharTy); SelExprs.push_back(StringLiteral::Create(*Context, Exp->getSelector().getAsString(), - false, false, argType, - SourceLocation())); + StringLiteral::Ascii, false, + argType, SourceLocation())); CallExpr *SelExp = SynthesizeCallToFunctionDecl(SelGetUidFunctionDecl, &SelExprs[0], SelExprs.size()); ReplaceStmt(Exp, SelExp); @@ -2797,7 +2797,8 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp, QualType argType = Context->getPointerType(Context->CharTy); ClsExprs.push_back(StringLiteral::Create(*Context, ClassDecl->getIdentifier()->getName(), - false, false, argType, SourceLocation())); + StringLiteral::Ascii, false, + argType, SourceLocation())); CallExpr *Cls = SynthesizeCallToFunctionDecl(GetMetaClassFunctionDecl, &ClsExprs[0], ClsExprs.size(), @@ -2875,7 +2876,7 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp, IdentifierInfo *clsName = Class->getIdentifier(); ClsExprs.push_back(StringLiteral::Create(*Context, clsName->getName(), - false, false, + StringLiteral::Ascii, false, argType, SourceLocation())); CallExpr *Cls = SynthesizeCallToFunctionDecl(GetClassFunctionDecl, &ClsExprs[0], @@ -2906,7 +2907,8 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp, QualType argType = Context->getPointerType(Context->CharTy); 
ClsExprs.push_back(StringLiteral::Create(*Context, ClassDecl->getIdentifier()->getName(), - false, false, argType, SourceLocation())); + StringLiteral::Ascii, false, argType, + SourceLocation())); CallExpr *Cls = SynthesizeCallToFunctionDecl(GetClassFunctionDecl, &ClsExprs[0], ClsExprs.size(), @@ -2987,7 +2989,8 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp, QualType argType = Context->getPointerType(Context->CharTy); SelExprs.push_back(StringLiteral::Create(*Context, Exp->getSelector().getAsString(), - false, false, argType, SourceLocation())); + StringLiteral::Ascii, false, + argType, SourceLocation())); CallExpr *SelExp = SynthesizeCallToFunctionDecl(SelGetUidFunctionDecl, &SelExprs[0], SelExprs.size(), StartLoc, diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 2e4198b520..28085ef6ea 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -605,7 +605,7 @@ bool Sema::CheckObjCString(Expr *Arg) { Arg = Arg->IgnoreParenCasts(); StringLiteral *Literal = dyn_cast(Arg); - if (!Literal || Literal->isWide()) { + if (!Literal || !Literal->isAscii()) { Diag(Arg->getLocStart(), diag::err_cfstring_literal_not_string_constant) << Arg->getSourceRange(); return true; @@ -1805,7 +1805,7 @@ void Sema::CheckFormatString(const StringLiteral *FExpr, bool isPrintf) { // CHECK: is the format string a wide literal? 
- if (FExpr->isWide()) { + if (!FExpr->isAscii()) { Diag(FExpr->getLocStart(), diag::warn_format_string_is_wide_literal) << OrigFormatExpr->getSourceRange(); diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp index 9e20bc9018..2cbd83a8fd 100644 --- a/lib/Sema/SemaDeclAttr.cpp +++ b/lib/Sema/SemaDeclAttr.cpp @@ -712,7 +712,7 @@ static void handleWeakRefAttr(Sema &S, Decl *D, const AttributeList &Attr) { Arg = Arg->IgnoreParenCasts(); StringLiteral *Str = dyn_cast(Arg); - if (Str == 0 || Str->isWide()) { + if (!Str || !Str->isAscii()) { S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string) << "weakref" << 1; return; @@ -737,7 +737,7 @@ static void handleAliasAttr(Sema &S, Decl *D, const AttributeList &Attr) { Arg = Arg->IgnoreParenCasts(); StringLiteral *Str = dyn_cast(Arg); - if (Str == 0 || Str->isWide()) { + if (!Str || !Str->isAscii()) { S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string) << "alias" << 1; return; @@ -1162,7 +1162,7 @@ static void handleVisibilityAttr(Sema &S, Decl *D, const AttributeList &Attr) { Arg = Arg->IgnoreParenCasts(); StringLiteral *Str = dyn_cast(Arg); - if (Str == 0 || Str->isWide()) { + if (!Str || !Str->isAscii()) { S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string) << "visibility" << 1; return; @@ -2464,7 +2464,7 @@ static void handleCallConvAttr(Sema &S, Decl *D, const AttributeList &Attr) { case AttributeList::AT_pcs: { Expr *Arg = Attr.getArg(0); StringLiteral *Str = dyn_cast(Arg); - if (Str == 0 || Str->isWide()) { + if (!Str || !Str->isAscii()) { S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string) << "pcs" << 1; Attr.setInvalid(); @@ -2519,7 +2519,7 @@ bool Sema::CheckCallingConvAttr(const AttributeList &attr, CallingConv &CC) { case AttributeList::AT_pcs: { Expr *Arg = attr.getArg(0); StringLiteral *Str = dyn_cast(Arg); - if (Str == 0 || Str->isWide()) { + if (!Str || !Str->isAscii()) { Diag(attr.getLoc(), diag::err_attribute_argument_n_not_string) << "pcs" << 
1; attr.setInvalid(); @@ -2868,7 +2868,7 @@ static void handleUuidAttr(Sema &S, Decl *D, const AttributeList &Attr) { Expr *Arg = Attr.getArg(0); StringLiteral *Str = dyn_cast(Arg); - if (Str == 0 || Str->isWide()) { + if (!Str || !Str->isAscii()) { S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string) << "uuid" << 1; return; diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp index 4a9b4bcfdf..dedf7b0d77 100644 --- a/lib/Sema/SemaExpr.cpp +++ b/lib/Sema/SemaExpr.cpp @@ -997,11 +997,25 @@ Sema::ActOnStringLiteral(const Token *StringToks, unsigned NumStringToks) { StringTokLocs.push_back(StringToks[i].getLocation()); QualType StrTy = Context.CharTy; - if (Literal.AnyWide) + if (Literal.isWide()) StrTy = Context.getWCharType(); + else if (Literal.isUTF16()) + StrTy = Context.Char16Ty; + else if (Literal.isUTF32()) + StrTy = Context.Char32Ty; else if (Literal.Pascal) StrTy = Context.UnsignedCharTy; + StringLiteral::StringKind Kind = StringLiteral::Ascii; + if (Literal.isWide()) + Kind = StringLiteral::Wide; + else if (Literal.isUTF8()) + Kind = StringLiteral::UTF8; + else if (Literal.isUTF16()) + Kind = StringLiteral::UTF16; + else if (Literal.isUTF32()) + Kind = StringLiteral::UTF32; + // A C++ string literal has a const-qualified element type (C++ 2.13.4p1). if (getLangOptions().CPlusPlus || getLangOptions().ConstStrings) StrTy.addConst(); @@ -1015,7 +1029,7 @@ Sema::ActOnStringLiteral(const Token *StringToks, unsigned NumStringToks) { // Pass &StringTokLocs[0], StringTokLocs.size() to factory! 
return Owned(StringLiteral::Create(Context, Literal.GetString(), - Literal.AnyWide, Literal.Pascal, StrTy, + Kind, Literal.Pascal, StrTy, &StringTokLocs[0], StringTokLocs.size())); } @@ -2412,7 +2426,7 @@ ExprResult Sema::ActOnCharacterConstant(const Token &Tok) { return ExprError(); CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(), Tok.getLocation(), - PP); + PP, Tok.getKind()); if (Literal.hadError()) return ExprError(); @@ -2421,14 +2435,25 @@ ExprResult Sema::ActOnCharacterConstant(const Token &Tok) { Ty = Context.IntTy; // 'x' and L'x' -> int in C. else if (Literal.isWide()) Ty = Context.WCharTy; // L'x' -> wchar_t in C++. + else if (Literal.isUTF16()) + Ty = Context.Char16Ty; // u'x' -> char16_t in C++0x. + else if (Literal.isUTF32()) + Ty = Context.Char32Ty; // U'x' -> char32_t in C++0x. else if (Literal.isMultiChar()) Ty = Context.IntTy; // 'wxyz' -> int in C++. else Ty = Context.CharTy; // 'x' -> char in C++ - return Owned(new (Context) CharacterLiteral(Literal.getValue(), - Literal.isWide(), - Ty, Tok.getLocation())); + CharacterLiteral::CharacterKind Kind = CharacterLiteral::Ascii; + if (Literal.isWide()) + Kind = CharacterLiteral::Wide; + else if (Literal.isUTF16()) + Kind = CharacterLiteral::UTF16; + else if (Literal.isUTF32()) + Kind = CharacterLiteral::UTF32; + + return Owned(new (Context) CharacterLiteral(Literal.getValue(), Kind, Ty, + Tok.getLocation())); } ExprResult Sema::ActOnNumericConstant(const Token &Tok) { @@ -8624,7 +8649,7 @@ static void MakeObjCStringLiteralFixItHint(Sema& SemaRef, QualType DstType, // Strip off any parens and casts. 
StringLiteral *SL = dyn_cast(SrcExpr->IgnoreParenCasts()); - if (!SL || SL->isWide()) + if (!SL || !SL->isAscii()) return; Hint = FixItHint::CreateInsertion(SL->getLocStart(), "@"); diff --git a/lib/Sema/SemaExprCXX.cpp b/lib/Sema/SemaExprCXX.cpp index 94a5bafa7c..1812510942 100644 --- a/lib/Sema/SemaExprCXX.cpp +++ b/lib/Sema/SemaExprCXX.cpp @@ -2041,12 +2041,20 @@ Sema::IsStringLiteralToNonConstPointerConversion(Expr *From, QualType ToType) { = ToPtrType->getPointeeType()->getAs()) { // This conversion is considered only when there is an // explicit appropriate pointer target type (C++ 4.2p2). - if (!ToPtrType->getPointeeType().hasQualifiers() && - ((StrLit->isWide() && ToPointeeType->isWideCharType()) || - (!StrLit->isWide() && - (ToPointeeType->getKind() == BuiltinType::Char_U || - ToPointeeType->getKind() == BuiltinType::Char_S)))) - return true; + if (!ToPtrType->getPointeeType().hasQualifiers()) { + switch (StrLit->getKind()) { + case StringLiteral::UTF8: + case StringLiteral::UTF16: + case StringLiteral::UTF32: + // We don't allow UTF literals to be implicitly converted + break; + case StringLiteral::Ascii: + return (ToPointeeType->getKind() == BuiltinType::Char_U || + ToPointeeType->getKind() == BuiltinType::Char_S); + case StringLiteral::Wide: + return ToPointeeType->isWideCharType(); + } + } } return false; diff --git a/lib/Sema/SemaExprObjC.cpp b/lib/Sema/SemaExprObjC.cpp index fccea7c0e1..e88726b6d2 100644 --- a/lib/Sema/SemaExprObjC.cpp +++ b/lib/Sema/SemaExprObjC.cpp @@ -47,8 +47,8 @@ ExprResult Sema::ParseObjCStringLiteral(SourceLocation *AtLocs, for (unsigned i = 0; i != NumStrings; ++i) { S = Strings[i]; - // ObjC strings can't be wide. - if (S->isWide()) { + // ObjC strings can't be wide or UTF. 
+ if (!S->isAscii()) { Diag(S->getLocStart(), diag::err_cfstring_literal_not_string_constant) << S->getSourceRange(); return true; @@ -64,7 +64,7 @@ ExprResult Sema::ParseObjCStringLiteral(SourceLocation *AtLocs, // Create the aggregate string with the appropriate content and location // information. S = StringLiteral::Create(Context, StrBuf, - /*Wide=*/false, /*Pascal=*/false, + StringLiteral::Ascii, /*Pascal=*/false, Context.getPointerType(Context.CharTy), &StrLocs[0], StrLocs.size()); } diff --git a/lib/Sema/SemaInit.cpp b/lib/Sema/SemaInit.cpp index adf88c62cc..c406ad9840 100644 --- a/lib/Sema/SemaInit.cpp +++ b/lib/Sema/SemaInit.cpp @@ -49,20 +49,30 @@ static Expr *IsStringInit(Expr *Init, const ArrayType *AT, if (SL == 0) return 0; QualType ElemTy = Context.getCanonicalType(AT->getElementType()); - // char array can be initialized with a narrow string. - // Only allow char x[] = "foo"; not char x[] = L"foo"; - if (!SL->isWide()) + + switch (SL->getKind()) { + case StringLiteral::Ascii: + case StringLiteral::UTF8: + // char array can be initialized with a narrow string. + // Only allow char x[] = "foo"; not char x[] = L"foo"; return ElemTy->isCharType() ? Init : 0; + case StringLiteral::UTF16: + return ElemTy->isChar16Type() ? Init : 0; + case StringLiteral::UTF32: + return ElemTy->isChar32Type() ? Init : 0; + case StringLiteral::Wide: + // wchar_t array can be initialized with a wide string: C99 6.7.8p15 (with + // correction from DR343): "An array with element type compatible with a + // qualified or unqualified version of wchar_t may be initialized by a wide + // string literal, optionally enclosed in braces." 
+ if (Context.typesAreCompatible(Context.getWCharType(), + ElemTy.getUnqualifiedType())) + return Init; - // wchar_t array can be initialized with a wide string: C99 6.7.8p15 (with - // correction from DR343): "An array with element type compatible with a - // qualified or unqualified version of wchar_t may be initialized by a wide - // string literal, optionally enclosed in braces." - if (Context.typesAreCompatible(Context.getWCharType(), - ElemTy.getUnqualifiedType())) - return Init; + return 0; + } - return 0; + llvm_unreachable("missed a StringLiteral kind?"); } static Expr *IsStringInit(Expr *init, QualType declType, ASTContext &Context) { diff --git a/lib/Sema/SemaStmt.cpp b/lib/Sema/SemaStmt.cpp index 0fd3f03982..56161ed9b4 100644 --- a/lib/Sema/SemaStmt.cpp +++ b/lib/Sema/SemaStmt.cpp @@ -1952,13 +1952,13 @@ StmtResult Sema::ActOnAsmStmt(SourceLocation AsmLoc, bool IsSimple, SmallVector OutputConstraintInfos; // The parser verifies that there is a string literal here. - if (AsmString->isWide()) + if (!AsmString->isAscii()) return StmtError(Diag(AsmString->getLocStart(),diag::err_asm_wide_character) << AsmString->getSourceRange()); for (unsigned i = 0; i != NumOutputs; i++) { StringLiteral *Literal = Constraints[i]; - if (Literal->isWide()) + if (!Literal->isAscii()) return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character) << Literal->getSourceRange()); @@ -1987,7 +1987,7 @@ StmtResult Sema::ActOnAsmStmt(SourceLocation AsmLoc, bool IsSimple, for (unsigned i = NumOutputs, e = NumOutputs + NumInputs; i != e; i++) { StringLiteral *Literal = Constraints[i]; - if (Literal->isWide()) + if (!Literal->isAscii()) return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character) << Literal->getSourceRange()); @@ -2034,7 +2034,7 @@ StmtResult Sema::ActOnAsmStmt(SourceLocation AsmLoc, bool IsSimple, // Check that the clobbers are valid. 
for (unsigned i = 0; i != NumClobbers; i++) { StringLiteral *Literal = Clobbers[i]; - if (Literal->isWide()) + if (!Literal->isAscii()) return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character) << Literal->getSourceRange()); diff --git a/lib/Sema/SemaTemplate.cpp b/lib/Sema/SemaTemplate.cpp index ceab7e93ac..006017f5a4 100644 --- a/lib/Sema/SemaTemplate.cpp +++ b/lib/Sema/SemaTemplate.cpp @@ -4131,10 +4131,22 @@ Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg, assert(Arg.getKind() == TemplateArgument::Integral && "Operation is only valid for integral template arguments"); QualType T = Arg.getIntegralType(); - if (T->isCharType() || T->isWideCharType()) + if (T->isAnyCharacterType()) { + CharacterLiteral::CharacterKind Kind; + if (T->isWideCharType()) + Kind = CharacterLiteral::Wide; + else if (T->isChar16Type()) + Kind = CharacterLiteral::UTF16; + else if (T->isChar32Type()) + Kind = CharacterLiteral::UTF32; + else + Kind = CharacterLiteral::Ascii; + return Owned(new (Context) CharacterLiteral( - Arg.getAsIntegral()->getZExtValue(), - T->isWideCharType(), T, Loc)); + Arg.getAsIntegral()->getZExtValue(), + Kind, T, Loc)); + } + if (T->isBooleanType()) return Owned(new (Context) CXXBoolLiteralExpr( Arg.getAsIntegral()->getBoolValue(), diff --git a/lib/Serialization/ASTReaderStmt.cpp b/lib/Serialization/ASTReaderStmt.cpp index 3559ccecd3..7a3c589c21 100644 --- a/lib/Serialization/ASTReaderStmt.cpp +++ b/lib/Serialization/ASTReaderStmt.cpp @@ -371,7 +371,7 @@ void ASTStmtReader::VisitStringLiteral(StringLiteral *E) { assert(Record[Idx] == E->getNumConcatenated() && "Wrong number of concatenated tokens!"); ++Idx; - E->IsWide = Record[Idx++]; + E->Kind = static_cast(Record[Idx++]); E->IsPascal = Record[Idx++]; // Read string data @@ -388,7 +388,7 @@ void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) { VisitExpr(E); E->setValue(Record[Idx++]); E->setLocation(ReadSourceLocation(Record, Idx)); - 
E->setWide(Record[Idx++]); + E->setKind(static_cast(Record[Idx++])); } void ASTStmtReader::VisitParenExpr(ParenExpr *E) { diff --git a/lib/Serialization/ASTWriterStmt.cpp b/lib/Serialization/ASTWriterStmt.cpp index 0b5bc1fcfe..f0636a1aa1 100644 --- a/lib/Serialization/ASTWriterStmt.cpp +++ b/lib/Serialization/ASTWriterStmt.cpp @@ -324,7 +324,7 @@ void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); Record.push_back(E->getByteLength()); Record.push_back(E->getNumConcatenated()); - Record.push_back(E->isWide()); + Record.push_back(E->getKind()); Record.push_back(E->isPascal()); // FIXME: String data should be stored as a blob at the end of the // StringLiteral. However, we can't do so now because we have no @@ -340,7 +340,7 @@ void ASTStmtWriter::VisitCharacterLiteral(CharacterLiteral *E) { VisitExpr(E); Record.push_back(E->getValue()); Writer.AddSourceLocation(E->getLocation(), Record); - Record.push_back(E->isWide()); + Record.push_back(E->getKind()); AbbrevToUse = Writer.getCharacterLiteralAbbrev(); diff --git a/test/CXX/lex/lex.literal/lex.ccon/p1.cpp b/test/CXX/lex/lex.literal/lex.ccon/p1.cpp index 7b65f7ee83..6df035d63f 100644 --- a/test/CXX/lex/lex.literal/lex.ccon/p1.cpp +++ b/test/CXX/lex/lex.literal/lex.ccon/p1.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++0x -fsyntax-only -verify %s +// Runs in c++0x mode so that char16_t and char32_t are available. 
// Check types of char literals extern char a; @@ -7,3 +8,7 @@ extern int b; extern __typeof('asdf') b; extern wchar_t c; extern __typeof(L'a') c; +extern char16_t d; +extern __typeof(u'a') d; +extern char32_t e; +extern __typeof(U'a') e; diff --git a/test/CodeGen/char-literal.c b/test/CodeGen/char-literal.c index 322041c004..014f6eb4fb 100644 --- a/test/CodeGen/char-literal.c +++ b/test/CodeGen/char-literal.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s -// Runs in c++ mode so that wchar_t is available. +// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// Runs in c++0x mode so that wchar_t, char16_t, and char32_t are available. int main() { // CHECK: store i8 97 @@ -16,6 +16,20 @@ int main() { // CHECK: store i32 98 wchar_t wb = L'ab'; + // CHECK: store i16 97 + char16_t ua = u'a'; + + // Should pick second character. + // CHECK: store i16 98 + char16_t ub = u'ab'; + + // CHECK: store i32 97 + char32_t Ua = U'a'; + + // Should pick second character. + // CHECK: store i32 98 + char32_t Ub = U'ab'; + // Should pick last character and store its lowest byte. // This does not match gcc, which takes the last character, converts it to // utf8, and then picks the second-lowest byte of that (they probably store @@ -26,10 +40,36 @@ int main() { // CHECK: store i32 61451 wchar_t wc = L'\uF00B'; + // -4085 == 0xf00b + // CHECK: store i16 -4085 + char16_t uc = u'\uF00B'; + + // CHECK: store i32 61451 + char32_t Uc = U'\uF00B'; + // CHECK: store i32 1110027 wchar_t wd = L'\U0010F00B'; + // Should take lower word of the 4byte UCN sequence. This does not match + // gcc. I don't understand what gcc does (it looks like it converts to utf16, + // then takes the second (!) utf16 word, swaps the lower two nibbles, and + // stores that?). 
+ // CHECK: store i16 -4085 + char16_t ud = u'\U0010F00B'; // has utf16 encoding dbc8 dcb0 + + // CHECK: store i32 1110027 + char32_t Ud = U'\U0010F00B'; + // Should pick second character. // CHECK: store i32 1110027 wchar_t we = L'\u1234\U0010F00B'; + + // Should pick second character. + // CHECK: store i16 -4085 + char16_t ue = u'\u1234\U0010F00B'; + + // Should pick second character. + // CHECK: store i32 1110027 + char32_t Ue = U'\u1234\U0010F00B'; + } diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c index cc6c0943d9..6d14330a0b 100644 --- a/test/CodeGen/string-literal.c +++ b/test/CodeGen/string-literal.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// Runs in c++0x mode so that wchar_t, char16_t, and char32_t are available. int main() { // CHECK: internal unnamed_addr constant [10 x i8] c"abc\00\00\00\00\00\00\00", align 1 @@ -9,8 +10,24 @@ int main() { char b[10] = "\u1120\u0220\U00102030"; // CHECK: private unnamed_addr constant [12 x i8] c"A\00\00\00B\00\00\00\00\00\00\00", align 1 - void *foo = L"AB"; + const wchar_t *foo = L"AB"; // CHECK: private unnamed_addr constant [12 x i8] c"4\12\00\00\0B\F0\10\00\00\00\00\00", align 1 - void *bar = L"\u1234\U0010F00B"; + const wchar_t *bar = L"\u1234\U0010F00B"; + + // CHECK: private unnamed_addr constant [12 x i8] c"C\00\00\00D\00\00\00\00\00\00\00", align 1 + const char32_t *c = U"CD"; + + // CHECK: private unnamed_addr constant [12 x i8] c"5\12\00\00\0C\F0\10\00\00\00\00\00", align 1 + const char32_t *d = U"\u1235\U0010F00C"; + + // CHECK: private unnamed_addr constant [6 x i8] c"E\00F\00\00\00", align 1 + const char16_t *e = u"EF"; + + // This should convert to utf16. 
+ // CHECK: private unnamed_addr constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00", align 1 + const char16_t *f = u"\u1120\u0220\U00102030"; + + // CHECK: private unnamed_addr constant [4 x i8] c"def\00", align 1 + const char *g = u8"def"; } diff --git a/test/Lexer/wchar.c b/test/Lexer/wchar.c index ac82c1f73b..648a38ef3f 100644 --- a/test/Lexer/wchar.c +++ b/test/Lexer/wchar.c @@ -5,8 +5,8 @@ void f() { (void)L'\U00010000'; // expected-warning {{character unicode escape sequence too long for its type}} - (void)L'ab'; // expected-warning {{extraneous characters in wide character constant ignored}} + (void)L'ab'; // expected-warning {{extraneous characters in character constant ignored}} - (void)L'a\u1000'; // expected-warning {{extraneous characters in wide character constant ignored}} + (void)L'a\u1000'; // expected-warning {{extraneous characters in character constant ignored}} } diff --git a/test/Parser/char-literal-printing.c b/test/Parser/char-literal-printing.c index 5843e5f401..a0cafd6441 100644 --- a/test/Parser/char-literal-printing.c +++ b/test/Parser/char-literal-printing.c @@ -1,6 +1,5 @@ -// RUN: %clang_cc1 -ast-print %s - -#include +// RUN: %clang_cc1 -x c++ -std=c++0x -ast-print %s +// Runs in c++0x mode so that wchar_t, char16_t, and char32_t are available. 
char test1(void) { return '\\'; } wchar_t test2(void) { return L'\\'; } @@ -29,3 +28,35 @@ char test23(void) { return '\x3'; } wchar_t test24(void) { return L'\x3'; } wchar_t test25(void) { return L'\x333'; } + +char16_t test26(void) { return u'\\'; } +char16_t test27(void) { return u'\''; } +char16_t test28(void) { return u'\a'; } +char16_t test29(void) { return u'\b'; } +char16_t test30(void) { return u'\e'; } +char16_t test31(void) { return u'\f'; } +char16_t test32(void) { return u'\n'; } +char16_t test33(void) { return u'\r'; } +char16_t test34(void) { return u'\t'; } +char16_t test35(void) { return u'\v'; } + +char16_t test36(void) { return u'c'; } +char16_t test37(void) { return u'\x3'; } + +char16_t test38(void) { return u'\x333'; } + +char32_t test39(void) { return U'\\'; } +char32_t test40(void) { return U'\''; } +char32_t test41(void) { return U'\a'; } +char32_t test42(void) { return U'\b'; } +char32_t test43(void) { return U'\e'; } +char32_t test44(void) { return U'\f'; } +char32_t test45(void) { return U'\n'; } +char32_t test46(void) { return U'\r'; } +char32_t test47(void) { return U'\t'; } +char32_t test48(void) { return U'\v'; } + +char32_t test49(void) { return U'c'; } +char32_t test50(void) { return U'\x3'; } + +char32_t test51(void) { return U'\x333'; } diff --git a/test/SemaCXX/type-convert-construct.cpp b/test/SemaCXX/type-convert-construct.cpp index 479af21476..a367633e48 100644 --- a/test/SemaCXX/type-convert-construct.cpp +++ b/test/SemaCXX/type-convert-construct.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=gnu++0x -fsyntax-only -verify %s +// Runs in c++0x mode so that char16_t and char32_t are available. 
void f() { float v1 = float(1); @@ -14,4 +15,8 @@ void f() { str = "a string"; // expected-warning{{conversion from string literal to 'char *' is deprecated}} wchar_t *wstr; wstr = L"a wide string"; // expected-warning{{conversion from string literal to 'wchar_t *' is deprecated}} + char16_t *ustr; + ustr = u"a UTF-16 string"; // expected-error {{assigning to 'char16_t *' from incompatible type 'const char16_t [16]'}} + char32_t *Ustr; + Ustr = U"a UTF-32 string"; // expected-error {{assigning to 'char32_t *' from incompatible type 'const char32_t [16]'}} } -- 2.40.0