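Teach Lexer::getSpelling about raw string literals. Specifically, if a raw string literal needs cleaning, the text between its quotes is copied into the spelling verbatim, with no trigraph expansion or line splicing applied to it.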

author Richard Smith <richard-llvm@metafoo.co.uk>

Wed, 28 Nov 2012 07:29:00 +0000 (07:29 +0000)

committer Richard Smith <richard-llvm@metafoo.co.uk>

Wed, 28 Nov 2012 07:29:00 +0000 (07:29 +0000)
author Richard Smith <richard-llvm@metafoo.co.uk>
Wed, 28 Nov 2012 07:29:00 +0000 (07:29 +0000)
committer Richard Smith <richard-llvm@metafoo.co.uk>
Wed, 28 Nov 2012 07:29:00 +0000 (07:29 +0000)
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp

index 4698e288c038fefa33e023dc346511f9704340dd..6cd18469e4cccc7101eb971c0fed713e830d201a 100644 (file)
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -233,16 +233,67 @@ void Lexer::Stringify(SmallVectorImpl<char> &Str) {
  // Token Spelling
  //===----------------------------------------------------------------------===//
  
+/// \brief Slow case of getSpelling. Re-lex the characters of the token that
+/// starts at \p BufPtr into \p Spelling; returns the number of chars written.
+static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
+                              const LangOptions &LangOpts, char *Spelling) {
+  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
+  // Assumes Spelling has room for Tok.getLength() chars -- TODO confirm for every caller.
+  size_t Length = 0; // Number of characters written to Spelling so far.
+  const char *BufEnd = BufPtr + Tok.getLength(); // One past the token's last source byte.
+
+  if (Tok.is(tok::string_literal)) { // NOTE(review): only this kind is matched; confirm prefixed (e.g. u8R) literals use it too.
+    // Munch the encoding-prefix and opening double-quote.
+    while (BufPtr < BufEnd) {
+      unsigned Size; // Set by getCharAndSizeNoWarn; used to advance BufPtr.
+      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+      BufPtr += Size;
+
+      if (Spelling[Length - 1] == '"') // Stop right after the opening quote.
+        break;
+    }
+
+    // Raw string literals need special handling; trigraph expansion and line
+    // splicing do not occur within their d-char-sequence nor within their
+    // r-char-sequence.
+    if (Length >= 2 &&
+        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
+      // Search backwards from the end of the token to find the matching closing
+      // quote.
+      const char *RawEnd = BufEnd;
+      do --RawEnd; while (*RawEnd != '"');
+      size_t RawLength = RawEnd - BufPtr + 1; // The +1 includes the closing quote itself.
+
+      // Everything between the quotes is included verbatim in the spelling.
+      memcpy(Spelling + Length, BufPtr, RawLength);
+      Length += RawLength;
+      BufPtr += RawLength; // Now points just past the closing quote.
+
+      // The rest of the token is lexed normally.
+    }
+  }
+
+  while (BufPtr < BufEnd) { // Decode whatever part of the token remains.
+    unsigned Size;
+    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+    BufPtr += Size;
+  }
+
+  assert(Length < Tok.getLength() &&
+         "NeedsCleaning flag set on token that didn't need cleaning!");
+  return Length;
+}
+
  /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
  /// token are the characters used to represent the token in the source file
  /// after trigraph expansion and escaped-newline folding.  In particular, this
  /// wants to get the true, uncanonicalized, spelling of things like digraphs
  /// UCNs, etc.
  StringRef Lexer::getSpelling(SourceLocation loc,
-                                   SmallVectorImpl<char> &buffer,
-                                   const SourceManager &SM,
-                                   const LangOptions &options,
-                                   bool *invalid) {
+                             SmallVectorImpl<char> &buffer,
+                             const SourceManager &SM,
+                             const LangOptions &options,
+                             bool *invalid) {
    // Break down the source location.
    std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
  
@@ -267,17 +318,10 @@ StringRef Lexer::getSpelling(SourceLocation loc,
    // Common case:  no need for cleaning.
    if (!token.needsCleaning())
      return StringRef(tokenBegin, length);
-  
-  // Hard case, we need to relex the characters into the string.
-  buffer.clear();
-  buffer.reserve(length);
-  
-  for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) {
-    unsigned charSize;
-    buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options));
-    ti += charSize;
-  }
  
+  // Hard case, we need to relex the characters into the string.
+  buffer.resize(length);
+  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
    return StringRef(buffer.data(), buffer.size());
  }
  
@@ -289,31 +333,22 @@ StringRef Lexer::getSpelling(SourceLocation loc,
  std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                                 const LangOptions &LangOpts, bool *Invalid) {
    assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
-  
-  // If this token contains nothing interesting, return it directly.
+
    bool CharDataInvalid = false;
-  const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 
+  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                      &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid)
      return std::string();
-  
+
+  // If this token contains nothing interesting, return it directly.
    if (!Tok.needsCleaning())
-    return std::string(TokStart, TokStart+Tok.getLength());
-  
+    return std::string(TokStart, TokStart + Tok.getLength());
+
    std::string Result;
-  Result.reserve(Tok.getLength());
-  
-  // Otherwise, hard case, relex the characters into the string.
-  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
-       Ptr != End; ) {
-    unsigned CharSize;
-    Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts));
-    Ptr += CharSize;
-  }
-  assert(Result.size() != unsigned(Tok.getLength()) &&
-         "NeedsCleaning flag set on something that didn't need cleaning!");
+  Result.resize(Tok.getLength());
+  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
    return Result;
  }
  
@@ -365,17 +400,7 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
    }
  
    // Otherwise, hard case, relex the characters into the string.
-  char *OutBuf = const_cast<char*>(Buffer);
-  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
-       Ptr != End; ) {
-    unsigned CharSize;
-    *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts);
-    Ptr += CharSize;
-  }
-  assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
-         "NeedsCleaning flag set on something that didn't need cleaning!");
-
-  return OutBuf-Buffer;
+  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
  }
  
  
diff --git a/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/test/CXX/lex/lex.literal/lex.ext/p5.cpp

index 4655aa17dc22247213ef10b99fa0b83bd61d89bb..06c091d8acae19e5ba67b6440a5aaef73fdf03ab 100644 (file)
--- a/test/CXX/lex/lex.literal/lex.ext/p5.cpp
+++ b/test/CXX/lex/lex.literal/lex.ext/p5.cpp
@@ -11,3 +11,10 @@ double &i3 = L"foo"_x1; // expected-error {{no matching literal operator}}
  char &operator "" _x1(const wchar_t *, size_t);
  char &i4 = L"foo"_x1; // ok
  double &i5 = R"(foo)"_x1; // ok
+double &i6 = u\
+8\
+R\
+"(foo)"\
+_\
+x\
+1; // ok
diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c

index 12d431a45434b5590431bd2cae103ddcc64fdac7..962b19d3dd08bf9d2cc5974b68a5d864020f8bc8 100644 (file)
--- a/test/CodeGen/string-literal.c
+++ b/test/CodeGen/string-literal.c
@@ -76,5 +76,12 @@ def)";
    const char *q = R"(abc
  def)" "ghi";
  
+  // CHECK-CPP0X: private unnamed_addr constant [13 x i8] c"abc\5C\0A??=\0Adef\00", align 1
+  const char *r = R\
+"(abc\
+??=
+def)";
+
+
  #endif
  }
author	Richard Smith <richard-llvm@metafoo.co.uk>
	Wed, 28 Nov 2012 07:29:00 +0000 (07:29 +0000)
committer	Richard Smith <richard-llvm@metafoo.co.uk>
	Wed, 28 Nov 2012 07:29:00 +0000 (07:29 +0000)
lib/Lex/Lexer.cpp		patch \| blob \| history
test/CXX/lex/lex.literal/lex.ext/p5.cpp		patch \| blob \| history
test/CodeGen/string-literal.c		patch \| blob \| history