From 30cddaec99fa6c3207613efdaedbb51dd8d70c77 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard-llvm@metafoo.co.uk>
Date: Wed, 28 Nov 2012 07:29:00 +0000
Subject: [PATCH] Teach Lexer::getSpelling about raw string literals.
 Specifically, if a raw string literal needs cleaning (because it contains
 line-splicing in the encoding prefix or in the ud-suffix), do not clean the
 section between the double-quotes -- that's the "raw" bit!

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@168776 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Lex/Lexer.cpp                       | 109 +++++++++++++++---------
 test/CXX/lex/lex.literal/lex.ext/p5.cpp |   7 ++
 test/CodeGen/string-literal.c           |   7 ++
 3 files changed, 81 insertions(+), 42 deletions(-)
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 4698e288c0..6cd18469e4 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -233,16 +233,67 @@ void Lexer::Stringify(SmallVectorImpl<char> &Str) {
 // Token Spelling
 //===----------------------------------------------------------------------===//
 
+/// \brief Slow case of getSpelling. Extract the characters comprising the
+/// spelling of this token from the provided input buffer.
+static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
+                              const LangOptions &LangOpts, char *Spelling) {
+  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
+
+  size_t Length = 0;
+  const char *BufEnd = BufPtr + Tok.getLength();
+
+  if (Tok.is(tok::string_literal)) {
+    // Munch the encoding-prefix and opening double-quote.
+    while (BufPtr < BufEnd) {
+      unsigned Size;
+      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+      BufPtr += Size;
+
+      if (Spelling[Length - 1] == '"')
+        break;
+    }
+
+    // Raw string literals need special handling; trigraph expansion and line
+    // splicing do not occur within their d-char-sequence nor within their
+    // r-char-sequence.
+    if (Length >= 2 &&
+        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
+      // Search backwards from the end of the token to find the matching closing
+      // quote.
+      const char *RawEnd = BufEnd;
+      do --RawEnd; while (*RawEnd != '"');
+      size_t RawLength = RawEnd - BufPtr + 1;
+
+      // Everything between the quotes is included verbatim in the spelling.
+      memcpy(Spelling + Length, BufPtr, RawLength);
+      Length += RawLength;
+      BufPtr += RawLength;
+
+      // The rest of the token is lexed normally.
+    }
+  }
+
+  while (BufPtr < BufEnd) {
+    unsigned Size;
+    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+    BufPtr += Size;
+  }
+
+  assert(Length < Tok.getLength() &&
+         "NeedsCleaning flag set on token that didn't need cleaning!");
+  return Length;
+}
+
 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
 /// token are the characters used to represent the token in the source file
 /// after trigraph expansion and escaped-newline folding.  In particular, this
 /// wants to get the true, uncanonicalized, spelling of things like digraphs
 /// UCNs, etc.
 StringRef Lexer::getSpelling(SourceLocation loc,
-                                   SmallVectorImpl<char> &buffer,
-                                   const SourceManager &SM,
-                                   const LangOptions &options,
-                                   bool *invalid) {
+                             SmallVectorImpl<char> &buffer,
+                             const SourceManager &SM,
+                             const LangOptions &options,
+                             bool *invalid) {
   // Break down the source location.
   std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
 
@@ -267,17 +318,10 @@ StringRef Lexer::getSpelling(SourceLocation loc,
   // Common case:  no need for cleaning.
   if (!token.needsCleaning())
     return StringRef(tokenBegin, length);
-  
-  // Hard case, we need to relex the characters into the string.
-  buffer.clear();
-  buffer.reserve(length);
-  
-  for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) {
-    unsigned charSize;
-    buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options));
-    ti += charSize;
-  }
 
+  // Hard case, we need to relex the characters into the string.
+  buffer.resize(length);
+  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
   return StringRef(buffer.data(), buffer.size());
 }
 
@@ -289,31 +333,22 @@ StringRef Lexer::getSpelling(SourceLocation loc,
 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                                const LangOptions &LangOpts, bool *Invalid) {
   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
-  
-  // If this token contains nothing interesting, return it directly.
+
   bool CharDataInvalid = false;
-  const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 
+  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                     &CharDataInvalid);
   if (Invalid)
     *Invalid = CharDataInvalid;
   if (CharDataInvalid)
     return std::string();
-  
+
+  // If this token contains nothing interesting, return it directly.
   if (!Tok.needsCleaning())
-    return std::string(TokStart, TokStart+Tok.getLength());
-  
+    return std::string(TokStart, TokStart + Tok.getLength());
+
   std::string Result;
-  Result.reserve(Tok.getLength());
-  
-  // Otherwise, hard case, relex the characters into the string.
-  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
-       Ptr != End; ) {
-    unsigned CharSize;
-    Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts));
-    Ptr += CharSize;
-  }
-  assert(Result.size() != unsigned(Tok.getLength()) &&
-         "NeedsCleaning flag set on something that didn't need cleaning!");
+  Result.resize(Tok.getLength());
+  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
   return Result;
 }
 
@@ -365,17 +400,7 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
   }
 
   // Otherwise, hard case, relex the characters into the string.
-  char *OutBuf = const_cast<char*>(Buffer);
-  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
-       Ptr != End; ) {
-    unsigned CharSize;
-    *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts);
-    Ptr += CharSize;
-  }
-  assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
-         "NeedsCleaning flag set on something that didn't need cleaning!");
-
-  return OutBuf-Buffer;
+  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
 }
 
 
diff --git a/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/test/CXX/lex/lex.literal/lex.ext/p5.cpp
index 4655aa17dc..06c091d8ac 100644
--- a/test/CXX/lex/lex.literal/lex.ext/p5.cpp
+++ b/test/CXX/lex/lex.literal/lex.ext/p5.cpp
@@ -11,3 +11,10 @@ double &i3 = L"foo"_x1; // expected-error {{no matching literal operator}}
 char &operator "" _x1(const wchar_t *, size_t);
 char &i4 = L"foo"_x1; // ok
 double &i5 = R"(foo)"_x1; // ok
+double &i6 = u\
+8\
+R\
+"(foo)"\
+_\
+x\
+1; // ok
diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c
index 12d431a454..962b19d3dd 100644
--- a/test/CodeGen/string-literal.c
+++ b/test/CodeGen/string-literal.c
@@ -76,5 +76,12 @@ def)";
   const char *q = R"(abc
 def)" "ghi";
 
+  // CHECK-CPP0X: private unnamed_addr constant [13 x i8] c"abc\5C\0A??=\0Adef\00", align 1
+  const char *r = R\
+"(abc\
+??=
+def)";
+
+
 #endif
 }
-- 
2.40.0