From 2fa4e86b4fdada3b9ecbbbd99965b83ed879f69b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 11 Aug 2011 04:06:15 +0000 Subject: [PATCH] Add support for C++0x raw string literals. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@137298 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/DiagnosticLexKinds.td | 9 + include/clang/Lex/Lexer.h | 2 + include/clang/Lex/LiteralSupport.h | 1 + lib/Lex/Lexer.cpp | 175 ++++++++++++-- lib/Lex/LiteralSupport.cpp | 222 ++++++++++++------ lib/Lex/TokenConcatenation.cpp | 48 ++-- test/CodeGen/string-literal.c | 25 +- test/Lexer/cxx0x_raw_string_delim_length.cpp | 6 + test/Lexer/cxx0x_raw_string_unterminated.cpp | 8 + test/SemaCXX/cxx0x-type-convert-construct.cpp | 11 + 10 files changed, 396 insertions(+), 111 deletions(-) create mode 100644 test/Lexer/cxx0x_raw_string_delim_length.cpp create mode 100644 test/Lexer/cxx0x_raw_string_unterminated.cpp diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index e23921be0b..1347ceb8c9 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -55,6 +55,15 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">; def err_conflict_marker : Error<"version control conflict marker in file">; +def err_raw_delim_too_long : Error< + "raw string delimiter longer than 16 characters" + "; use PREFIX( )PREFIX to delimit raw string">; +def err_invalid_char_raw_delim : Error< + "invalid character '%0' in raw string delimiter" + "; use PREFIX( )PREFIX to delimit raw string">; +def err_unterminated_raw_string : Error< + "raw string missing terminating delimiter )%0\"">; + def ext_multichar_character_literal : ExtWarn< "multi-character character constant">, InGroup<MultiChar>; def ext_four_char_character_literal : Extension< diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index e24fe9c9ab..3bc44b192a 100644 --- a/include/clang/Lex/Lexer.h +++ 
b/include/clang/Lex/Lexer.h @@ -485,6 +485,8 @@ private: void LexNumericConstant (Token &Result, const char *CurPtr); void LexStringLiteral (Token &Result, const char *CurPtr, tok::TokenKind Kind); + void LexRawStringLiteral (Token &Result, const char *CurPtr, + tok::TokenKind Kind); void LexAngledStringLiteral(Token &Result, const char *CurPtr); void LexCharConstant (Token &Result, const char *CurPtr, tok::TokenKind Kind); diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h index 15057299b2..3a3782a3b2 100644 --- a/include/clang/Lex/LiteralSupport.h +++ b/include/clang/Lex/LiteralSupport.h @@ -197,6 +197,7 @@ public: private: void init(const Token *StringToks, unsigned NumStringToks); + void CopyStringFragment(const StringRef &Fragment); }; } // end namespace clang diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 0664cbc21b..0c32c8d9ba 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBuffer.h" #include +#include using namespace clang; static void InitCharacterInfo(); @@ -760,7 +761,8 @@ enum { CHAR_LETTER = 0x04, // a-z,A-Z CHAR_NUMBER = 0x08, // 0-9 CHAR_UNDER = 0x10, // _ - CHAR_PERIOD = 0x20 // . + CHAR_PERIOD = 0x20, // . + CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' }; // Statically initialize CharInfo table based on ASCII character set @@ -785,20 +787,20 @@ static const unsigned char CharInfo[256] = 0 , 0 , 0 , 0 , //32 SP 33 ! 34 " 35 # //36 $ 37 % 38 & 39 ' - CHAR_HORZ_WS, 0 , 0 , 0 , - 0 , 0 , 0 , 0 , + CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , //40 ( 41 ) 42 * 43 + //44 , 45 - 46 . 
47 / - 0 , 0 , 0 , 0 , - 0 , 0 , CHAR_PERIOD , 0 , + 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , //48 0 49 1 50 2 51 3 //52 4 53 5 54 6 55 7 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , //56 8 57 9 58 : 59 ; //60 < 61 = 62 > 63 ? - CHAR_NUMBER , CHAR_NUMBER , 0 , 0 , - 0 , 0 , 0 , 0 , + CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , //64 @ 65 A 66 B 67 C //68 D 69 E 70 F 71 G 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , @@ -813,8 +815,8 @@ static const unsigned char CharInfo[256] = CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , //88 X 89 Y 90 Z 91 [ //92 \ 93 ] 94 ^ 95 _ - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , - 0 , 0 , 0 , CHAR_UNDER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , + 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , //96 ` 97 a 98 b 99 c //100 d 101 e 102 f 103 g 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , @@ -828,9 +830,9 @@ static const unsigned char CharInfo[256] = CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , //120 x 121 y 122 z 123 { -//124 | 125 } 126 ~ 127 DEL - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , - 0 , 0 , 0 , 0 +//124 | 125 } 126 ~ 127 DEL + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 }; static void InitCharacterInfo() { @@ -888,6 +890,14 @@ static inline bool isNumberBody(unsigned char c) { true : false; } +/// isRawStringDelimBody - Return true if this is the body character of a +/// raw string delimiter. +static inline bool isRawStringDelimBody(unsigned char c) { + return (CharInfo[c] & + (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? + true : false; +} + //===----------------------------------------------------------------------===// // Diagnostics forwarding code. 
@@ -1363,6 +1373,78 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, Result.setLiteralData(TokStart); } +/// LexRawStringLiteral - Lex the remainder of a raw string literal, after +/// having lexed R", LR", u8R", uR", or UR". +void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, + tok::TokenKind Kind) { + // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: + // Between the initial and final double quote characters of the raw string, + // any transformations performed in phases 1 and 2 (trigraphs, + // universal-character-names, and line splicing) are reverted. + + unsigned PrefixLen = 0; + + while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) + ++PrefixLen; + + // If the last character was not a '(', then we didn't lex a valid delimiter. + if (CurPtr[PrefixLen] != '(') { + if (!isLexingRawMode()) { + const char *PrefixEnd = &CurPtr[PrefixLen]; + if (PrefixLen == 16) { + Diag(PrefixEnd, diag::err_raw_delim_too_long); + } else { + Diag(PrefixEnd, diag::err_invalid_char_raw_delim) + << StringRef(PrefixEnd, 1); + } + } + + // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, + // it's possible the '"' was intended to be part of the raw string, but + // there's not much we can do about that. + while (1) { + char C = *CurPtr++; + + if (C == '"') + break; + if (C == 0 && CurPtr-1 == BufferEnd) { + --CurPtr; + break; + } + } + + FormTokenWithChars(Result, CurPtr, tok::unknown); + return; + } + + // Save prefix and move CurPtr past it + const char *Prefix = CurPtr; + CurPtr += PrefixLen + 1; // skip over prefix and '(' + + while (1) { + char C = *CurPtr++; + + if (C == ')') { + // Check for prefix match and closing quote. + if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { + CurPtr += PrefixLen + 1; // skip over prefix and '"' + break; + } + } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 
+ if (!isLexingRawMode()) + Diag(BufferPtr, diag::err_unterminated_raw_string) + << StringRef(Prefix, PrefixLen); + FormTokenWithChars(Result, CurPtr-1, tok::unknown); + return; + } + } + + // Update the location of token as well as BufferPtr. + const char *TokStart = BufferPtr; + FormTokenWithChars(Result, CurPtr, Kind); + Result.setLiteralData(TokStart); +} + /// LexAngledStringLiteral - Lex the remainder of an angled string literal, /// after having lexed the '<' character. This is used for #include filenames. void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { @@ -2262,12 +2344,36 @@ LexNextToken: return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), tok::utf16_char_constant); - // UTF-8 string literal - if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') - return LexStringLiteral(Result, - ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), - SizeTmp2, Result), - tok::utf8_string_literal); + // UTF-16 raw string literal + if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') + return LexRawStringLiteral(Result, + ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + tok::utf16_string_literal); + + if (Char == '8') { + char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); + + // UTF-8 string literal + if (Char2 == '"') + return LexStringLiteral(Result, + ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + tok::utf8_string_literal); + + if (Char2 == 'R') { + unsigned SizeTmp3; + char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); + // UTF-8 raw string literal + if (Char3 == '"') { + return LexRawStringLiteral(Result, + ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + SizeTmp3, Result), + tok::utf8_string_literal); + } + } + } } // treat u like the start of an identifier. 
@@ -2289,11 +2395,34 @@ LexNextToken: if (Char == '\'') return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), tok::utf32_char_constant); + + // UTF-32 raw string literal + if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') + return LexRawStringLiteral(Result, + ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + tok::utf32_string_literal); } // treat U like the start of an identifier. return LexIdentifier(Result, CurPtr); + case 'R': // Identifier or C++0x raw string literal + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + + if (Features.CPlusPlus0x) { + Char = getCharAndSize(CurPtr, SizeTmp); + + if (Char == '"') + return LexRawStringLiteral(Result, + ConsumeChar(CurPtr, SizeTmp, Result), + tok::string_literal); + } + + // treat R like the start of an identifier. + return LexIdentifier(Result, CurPtr); + case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); @@ -2304,6 +2433,14 @@ LexNextToken: return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), tok::wide_string_literal); + // Wide raw string literal. + if (Features.CPlusPlus0x && Char == 'R' && + getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') + return LexRawStringLiteral(Result, + ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + tok::wide_string_literal); + // Wide character constant. if (Char == '\'') return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), @@ -2313,7 +2450,7 @@ LexNextToken: // C99 6.4.2: Identifiers. 
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': - case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': /*'U'*/ + case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index a40908bd9f..c74b1466f3 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -713,6 +713,38 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { } +/// character-literal: [C++0x lex.ccon] +/// ' c-char-sequence ' +/// u' c-char-sequence ' +/// U' c-char-sequence ' +/// L' c-char-sequence ' +/// c-char-sequence: +/// c-char +/// c-char-sequence c-char +/// c-char: +/// any member of the source character set except the single-quote ', +/// backslash \, or new-line character +/// escape-sequence +/// universal-character-name +/// escape-sequence: [C++0x lex.ccon] +/// simple-escape-sequence +/// octal-escape-sequence +/// hexadecimal-escape-sequence +/// simple-escape-sequence: +/// one of \' \" \? 
\\ \a \b \f \n \r \t \v +/// octal-escape-sequence: +/// \ octal-digit +/// \ octal-digit octal-digit +/// \ octal-digit octal-digit octal-digit +/// hexadecimal-escape-sequence: +/// \x hexadecimal-digit +/// hexadecimal-escape-sequence hexadecimal-digit +/// universal-character-name: +/// \u hex-quad +/// \U hex-quad hex-quad +/// hex-quad: +/// hex-digit hex-digit hex-digit hex-digit +/// CharLiteralParser::CharLiteralParser(const char *begin, const char *end, SourceLocation Loc, Preprocessor &PP, tok::TokenKind kind) { @@ -825,34 +857,52 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } -/// string-literal: [C99 6.4.5] -/// " [s-char-sequence] " -/// L" [s-char-sequence] " +/// string-literal: [C++0x lex.string] +/// encoding-prefix " [s-char-sequence] " +/// encoding-prefix R raw-string +/// encoding-prefix: +/// u8 +/// u +/// U +/// L /// s-char-sequence: /// s-char /// s-char-sequence s-char /// s-char: -/// any source character except the double quote ", -/// backslash \, or newline character -/// escape-character -/// universal-character-name -/// escape-character: [C99 6.4.4.4] -/// \ escape-code +/// any member of the source character set except the double-quote ", +/// backslash \, or new-line character +/// escape-sequence /// universal-character-name -/// escape-code: -/// character-escape-code -/// octal-escape-code -/// hex-escape-code -/// character-escape-code: one of -/// n t b r f v a -/// \ ' " ? -/// octal-escape-code: -/// octal-digit -/// octal-digit octal-digit -/// octal-digit octal-digit octal-digit -/// hex-escape-code: -/// x hex-digit -/// hex-escape-code hex-digit +/// raw-string: +/// " d-char-sequence ( r-char-sequence ) d-char-sequence " +/// r-char-sequence: +/// r-char +/// r-char-sequence r-char +/// r-char: +/// any member of the source character set, except a right parenthesis ) +/// followed by the initial d-char-sequence (which may be empty) +/// followed by a double quote ". 
+/// d-char-sequence: +/// d-char +/// d-char-sequence d-char +/// d-char: +/// any member of the basic source character set except: +/// space, the left parenthesis (, the right parenthesis ), +/// the backslash \, and the control characters representing horizontal +/// tab, vertical tab, form feed, and newline. +/// escape-sequence: [C++0x lex.ccon] +/// simple-escape-sequence +/// octal-escape-sequence +/// hexadecimal-escape-sequence +/// simple-escape-sequence: +/// one of \' \" \? \\ \a \b \f \n \r \t \v +/// octal-escape-sequence: +/// \ octal-digit +/// \ octal-digit octal-digit +/// \ octal-digit octal-digit octal-digit +/// hexadecimal-escape-sequence: +/// \x hexadecimal-digit +/// hexadecimal-escape-sequence hexadecimal-digit /// universal-character-name: /// \u hex-quad /// \U hex-quad hex-quad @@ -972,64 +1022,69 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ ++ThisTokBuf; } - assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); - ++ThisTokBuf; + // Check for raw string + if (ThisTokBuf[0] == 'R') { + ThisTokBuf += 2; // skip R" - // Check if this is a pascal string - if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && - ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { - - // If the \p sequence is found in the first token, we have a pascal string - // Otherwise, if we already have a pascal string, ignore the first \p - if (i == 0) { + const char *Prefix = ThisTokBuf; + while (ThisTokBuf[0] != '(') ++ThisTokBuf; - Pascal = true; - } else if (Pascal) - ThisTokBuf += 2; - } + ++ThisTokBuf; // skip '(' + + // remove same number of characters from the end + if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix)) + ThisTokEnd -= (ThisTokBuf - Prefix); + + // Copy the string over + CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)); + } else { + assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); + ++ThisTokBuf; // skip " + + // Check if this is a pascal string + if 
(Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && + ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { - while (ThisTokBuf != ThisTokEnd) { - // Is this a span of non-escape characters? - if (ThisTokBuf[0] != '\\') { - const char *InStart = ThisTokBuf; - do { + // If the \p sequence is found in the first token, we have a pascal string + // Otherwise, if we already have a pascal string, ignore the first \p + if (i == 0) { ++ThisTokBuf; - } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); - - // Copy the character span over. - unsigned Len = ThisTokBuf-InStart; - if (CharByteWidth == 1) { - memcpy(ResultPtr, InStart, Len); - ResultPtr += Len; - } else { - // Note: our internal rep of wide char tokens is always little-endian. - for (; Len; --Len, ++InStart) { - *ResultPtr++ = InStart[0]; - // Add zeros at the end. - for (unsigned i = 1, e = CharByteWidth; i != e; ++i) - *ResultPtr++ = 0; - } - } - continue; + Pascal = true; + } else if (Pascal) + ThisTokBuf += 2; } - // Is this a Universal Character Name escape? - if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { - EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, - hadError, FullSourceLoc(StringToks[i].getLocation(),SM), - CharByteWidth, Diags, Features); - continue; - } - // Otherwise, this is a non-UCN escape character. Process it. - unsigned ResultChar = - ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, - FullSourceLoc(StringToks[i].getLocation(), SM), - CharByteWidth*8, Diags); - // Note: our internal rep of wide char tokens is always little-endian. - *ResultPtr++ = ResultChar & 0xFF; + while (ThisTokBuf != ThisTokEnd) { + // Is this a span of non-escape characters? + if (ThisTokBuf[0] != '\\') { + const char *InStart = ThisTokBuf; + do { + ++ThisTokBuf; + } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); + + // Copy the character span over. + CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)); + continue; + } + // Is this a Universal Character Name escape? 
+ if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { + EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, + hadError, FullSourceLoc(StringToks[i].getLocation(),SM), + CharByteWidth, Diags, Features); + continue; + } + // Otherwise, this is a non-UCN escape character. Process it. + unsigned ResultChar = + ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, + FullSourceLoc(StringToks[i].getLocation(), SM), + CharByteWidth*8, Diags); + + // Note: our internal rep of wide char tokens is always little-endian. + *ResultPtr++ = ResultChar & 0xFF; - for (unsigned i = 1, e = CharByteWidth; i != e; ++i) - *ResultPtr++ = ResultChar >> i*8; + for (unsigned i = 1, e = CharByteWidth; i != e; ++i) + *ResultPtr++ = ResultChar >> i*8; + } } } @@ -1062,6 +1117,25 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } +/// CopyStringFragment - This function copies Fragment into ResultPtr. +/// Performs widening for multi-byte characters. +void StringLiteralParser::CopyStringFragment(const StringRef &Fragment) { + // Copy the character span over. + if (CharByteWidth == 1) { + memcpy(ResultPtr, Fragment.data(), Fragment.size()); + ResultPtr += Fragment.size(); + } else { + // Note: our internal rep of wide char tokens is always little-endian. + for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) { + *ResultPtr++ = *I; + // Add zeros at the end. + for (unsigned i = 1, e = CharByteWidth; i != e; ++i) + *ResultPtr++ = 0; + } + } +} + + /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. This handles /// advancing over escape sequences in the string. diff --git a/lib/Lex/TokenConcatenation.cpp b/lib/Lex/TokenConcatenation.cpp index 19baf80aad..d6f3bc493f 100644 --- a/lib/Lex/TokenConcatenation.cpp +++ b/lib/Lex/TokenConcatenation.cpp @@ -17,39 +17,53 @@ using namespace clang; +/// IsStringPrefix - Return true if Str is a string prefix. 
+/// 'L', 'u', 'U', or 'u8'. Including raw versions. +static bool IsStringPrefix(const StringRef &Str, bool CPlusPlus0x) { + + if (Str[0] == 'L' || + (CPlusPlus0x && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) { + + if (Str.size() == 1) + return true; // "L", "u", "U", and "R" + + // Check for raw flavors. Need to make sure the first character wasn't + // already R. Need CPlusPlus0x check for "LR". + if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus0x) + return true; // "LR", "uR", "UR" + + // Check for "u8" and "u8R" + if (Str[0] == 'u' && Str[1] == '8') { + if (Str.size() == 2) return true; // "u8" + if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R" + } + } + + return false; +} + /// IsIdentifierStringPrefix - Return true if the spelling of the token -/// is literally 'L', 'u', 'U', or 'u8'. +/// is literally 'L', 'u', 'U', or 'u8'. Including raw versions. bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { const LangOptions &LangOpts = PP.getLangOptions(); if (!Tok.needsCleaning()) { - if (Tok.getLength() != 1 && Tok.getLength() != 2) + if (Tok.getLength() < 1 || Tok.getLength() > 3) return false; SourceManager &SM = PP.getSourceManager(); const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); - if (Tok.getLength() == 1) - return Ptr[0] == 'L' || - (LangOpts.CPlusPlus0x && (Ptr[0] == 'u' || Ptr[0] == 'U')); - if (Tok.getLength() == 2) - return LangOpts.CPlusPlus0x && Ptr[0] == 'u' && Ptr[1] == '8'; + return IsStringPrefix(StringRef(Ptr, Tok.getLength()), + LangOpts.CPlusPlus0x); } if (Tok.getLength() < 256) { char Buffer[256]; const char *TokPtr = Buffer; unsigned length = PP.getSpelling(Tok, TokPtr); - if (length == 1) - return TokPtr[0] == 'L' || - (LangOpts.CPlusPlus0x && (TokPtr[0] == 'u' || TokPtr[0] == 'U')); - if (length == 2) - return LangOpts.CPlusPlus0x && TokPtr[0] == 'u' && TokPtr[1] == '8'; - return false; + return IsStringPrefix(StringRef(TokPtr, length), 
LangOpts.CPlusPlus0x); } - std::string TokStr = PP.getSpelling(Tok); - return TokStr == "L" || (LangOpts.CPlusPlus0x && (TokStr == "u8" || - TokStr == "u" || - TokStr == "U")); + return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus0x); } TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c index dfa609fe8a..98216423bc 100644 --- a/test/CodeGen/string-literal.c +++ b/test/CodeGen/string-literal.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=C %s // RUN: %clang_cc1 -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=C %s -// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=C %s +// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=CPP0X %s #include @@ -38,5 +38,28 @@ int main() { // CHECK-CPP0X: private unnamed_addr constant [4 x i8] c"def\00", align 1 const char *g = u8"def"; + + // CHECK-CPP0X: private unnamed_addr constant [4 x i8] c"ghi\00", align 1 + const char *h = R"foo(ghi)foo"; + + // CHECK-CPP0X: private unnamed_addr constant [4 x i8] c"jkl\00", align 1 + const char *i = u8R"bar(jkl)bar"; + + // CHECK-CPP0X: private unnamed_addr constant [6 x i8] c"G\00H\00\00\00", align 2 + const char16_t *j = uR"foo(GH)foo"; + + // CHECK-CPP0X: private unnamed_addr constant [12 x i8] c"I\00\00\00J\00\00\00\00\00\00\00", align 4 + const char32_t *k = UR"bar(IJ)bar"; + + // CHECK-CPP0X: private unnamed_addr constant [12 x i8] c"K\00\00\00L\00\00\00\00\00\00\00", align 4 + const wchar_t *l = LR"bar(KL)bar"; + + // CHECK-CPP0X: private unnamed_addr constant [9 x i8] c"abc\5Cndef\00", align 1 + const char *m = R"(abc\ndef)"; + + // CHECK-CPP0X: private unnamed_addr constant [8 x i8] c"abc\0Adef\00", align 1 + const char *n = R"(abc +def)"; + #endif } diff --git 
a/test/Lexer/cxx0x_raw_string_delim_length.cpp b/test/Lexer/cxx0x_raw_string_delim_length.cpp new file mode 100644 index 0000000000..c7b32f8698 --- /dev/null +++ b/test/Lexer/cxx0x_raw_string_delim_length.cpp @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string delimiter longer than 16 characters' + +const char *str = R"abcdefghijkmnopqrstuvwxyz(abcdef)abcdefghijkmnopqrstuvwxyz"; +// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string delimiter longer than 16 characters' + +const char *str = R"abcdefghijkmnopqrstuvwxyz(abcdef)abcdefghijkmnopqrstuvwxyz"; diff --git a/test/Lexer/cxx0x_raw_string_unterminated.cpp b/test/Lexer/cxx0x_raw_string_unterminated.cpp new file mode 100644 index 0000000000..7813c999f0 --- /dev/null +++ b/test/Lexer/cxx0x_raw_string_unterminated.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string missing terminating delimiter )foo"' + +const char *str = R"foo(abc +def)bar"; +// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string missing terminating delimiter )foo"' + +const char *str = R"foo(abc +def)bar"; diff --git a/test/SemaCXX/cxx0x-type-convert-construct.cpp b/test/SemaCXX/cxx0x-type-convert-construct.cpp index a523108c6e..f32c8e2014 100644 --- a/test/SemaCXX/cxx0x-type-convert-construct.cpp +++ b/test/SemaCXX/cxx0x-type-convert-construct.cpp @@ -7,4 +7,15 @@ void f() { ustr = u"a UTF-16 string"; // expected-error {{assigning to 'char16_t *' from incompatible type 'const char16_t [16]'}} char32_t *Ustr; Ustr = U"a UTF-32 string"; // expected-error {{assigning to 'char32_t *' from incompatible type 'const char32_t [16]'}} + + char *Rstr; + Rstr = R"foo(a raw string)foo"; // expected-warning{{conversion from string literal to 'char *' is deprecated}} + wchar_t *LRstr; + LRstr = LR"foo(a wide raw string)foo"; // expected-warning{{conversion from string literal to 'wchar_t *' is deprecated}} + char *u8Rstr; + u8Rstr = u8R"foo(a UTF-8 raw string)foo"; // 
expected-error {{assigning to 'char *' from incompatible type 'const char [19]'}} + char16_t *uRstr; + uRstr = uR"foo(a UTF-16 raw string)foo"; // expected-error {{assigning to 'char16_t *' from incompatible type 'const char16_t [20]'}} + char32_t *URstr; + URstr = UR"foo(a UTF-32 raw string)foo"; // expected-error {{assigning to 'char32_t *' from incompatible type 'const char32_t [20]'}} } -- 2.40.0