From 30cddaec99fa6c3207613efdaedbb51dd8d70c77 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 28 Nov 2012 07:29:00 +0000 Subject: [PATCH] Teach Lexer::getSpelling about raw string literals. Specifically, if a raw string literal needs cleaning (because it contains line-splicing in the encoding prefix or in the ud-suffix), do not clean the section between the double-quotes -- that's the "raw" bit! git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@168776 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Lex/Lexer.cpp | 109 +++++++++++++++--------- test/CXX/lex/lex.literal/lex.ext/p5.cpp | 7 ++ test/CodeGen/string-literal.c | 7 ++ 3 files changed, 81 insertions(+), 42 deletions(-) diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 4698e288c0..6cd18469e4 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -233,16 +233,67 @@ void Lexer::Stringify(SmallVectorImpl<char> &Str) { // Token Spelling //===----------------------------------------------------------------------===// +/// \brief Slow case of getSpelling. Extract the characters comprising the +/// spelling of this token from the provided input buffer. +static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, + const LangOptions &LangOpts, char *Spelling) { + assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); + + size_t Length = 0; + const char *BufEnd = BufPtr + Tok.getLength(); + + if (Tok.is(tok::string_literal)) { + // Munch the encoding-prefix and opening double-quote. + while (BufPtr < BufEnd) { + unsigned Size; + Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; + + if (Spelling[Length - 1] == '"') + break; + } + + // Raw string literals need special handling; trigraph expansion and line + // splicing do not occur within their d-char-sequence nor within their + // r-char-sequence. 
+ if (Length >= 2 && + Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { + // Search backwards from the end of the token to find the matching closing + // quote. + const char *RawEnd = BufEnd; + do --RawEnd; while (*RawEnd != '"'); + size_t RawLength = RawEnd - BufPtr + 1; + + // Everything between the quotes is included verbatim in the spelling. + memcpy(Spelling + Length, BufPtr, RawLength); + Length += RawLength; + BufPtr += RawLength; + + // The rest of the token is lexed normally. + } + } + + while (BufPtr < BufEnd) { + unsigned Size; + Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; + } + + assert(Length < Tok.getLength() && + "NeedsCleaning flag set on token that didn't need cleaning!"); + return Length; +} + /// getSpelling() - Return the 'spelling' of this token. The spelling of a /// token are the characters used to represent the token in the source file /// after trigraph expansion and escaped-newline folding. In particular, this /// wants to get the true, uncanonicalized, spelling of things like digraphs /// UCNs, etc. StringRef Lexer::getSpelling(SourceLocation loc, - SmallVectorImpl<char> &buffer, - const SourceManager &SM, - const LangOptions &options, - bool *invalid) { + SmallVectorImpl<char> &buffer, + const SourceManager &SM, + const LangOptions &options, + bool *invalid) { // Break down the source location. std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); @@ -267,17 +318,10 @@ StringRef Lexer::getSpelling(SourceLocation loc, // Common case: no need for cleaning. if (!token.needsCleaning()) return StringRef(tokenBegin, length); - - // Hard case, we need to relex the characters into the string. - buffer.clear(); - buffer.reserve(length); - - for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) { - unsigned charSize; - buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options)); - ti += charSize; - } + // Hard case, we need to relex the characters into the string. 
+ buffer.resize(length); + buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); return StringRef(buffer.data(), buffer.size()); } @@ -289,31 +333,22 @@ StringRef Lexer::getSpelling(SourceLocation loc, std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); - - // If this token contains nothing interesting, return it directly. + bool CharDataInvalid = false; - const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), + const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); if (Invalid) *Invalid = CharDataInvalid; if (CharDataInvalid) return std::string(); - + + // If this token contains nothing interesting, return it directly. if (!Tok.needsCleaning()) - return std::string(TokStart, TokStart+Tok.getLength()); - + return std::string(TokStart, TokStart + Tok.getLength()); + std::string Result; - Result.reserve(Tok.getLength()); - - // Otherwise, hard case, relex the characters into the string. - for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); - Ptr != End; ) { - unsigned CharSize; - Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts)); - Ptr += CharSize; - } - assert(Result.size() != unsigned(Tok.getLength()) && - "NeedsCleaning flag set on something that didn't need cleaning!"); + Result.resize(Tok.getLength()); + Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); return Result; } @@ -365,17 +400,7 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, } // Otherwise, hard case, relex the characters into the string. 
- char *OutBuf = const_cast<char*>(Buffer); - for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); - Ptr != End; ) { - unsigned CharSize; - *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts); - Ptr += CharSize; - } - assert(unsigned(OutBuf-Buffer) != Tok.getLength() && - "NeedsCleaning flag set on something that didn't need cleaning!"); - - return OutBuf-Buffer; + return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); } diff --git a/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/test/CXX/lex/lex.literal/lex.ext/p5.cpp index 4655aa17dc..06c091d8ac 100644 --- a/test/CXX/lex/lex.literal/lex.ext/p5.cpp +++ b/test/CXX/lex/lex.literal/lex.ext/p5.cpp @@ -11,3 +11,10 @@ double &i3 = L"foo"_x1; // expected-error {{no matching literal operator}} char &operator "" _x1(const wchar_t *, size_t); char &i4 = L"foo"_x1; // ok double &i5 = R"(foo)"_x1; // ok +double &i6 = u\ +8\ +R\ +"(foo)"\ +_\ +x\ +1; // ok diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c index 12d431a454..962b19d3dd 100644 --- a/test/CodeGen/string-literal.c +++ b/test/CodeGen/string-literal.c @@ -76,5 +76,12 @@ def)"; const char *q = R"(abc def)" "ghi"; + // CHECK-CPP0X: private unnamed_addr constant [13 x i8] c"abc\5C\0A??=\0Adef\00", align 1 + const char *r = R\ +"(abc\ +??= +def)"; + + #endif } -- 2.40.0