Add support for C++0x raw string literals.

author Craig Topper <craig.topper@gmail.com>

Thu, 11 Aug 2011 04:06:15 +0000 (04:06 +0000)

committer Craig Topper <craig.topper@gmail.com>

Thu, 11 Aug 2011 04:06:15 +0000 (04:06 +0000)
author Craig Topper <craig.topper@gmail.com>
Thu, 11 Aug 2011 04:06:15 +0000 (04:06 +0000)
committer Craig Topper <craig.topper@gmail.com>
Thu, 11 Aug 2011 04:06:15 +0000 (04:06 +0000)
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td

index e23921be0bfc849cfe0aeba964af31fd059b75db..1347ceb8c9d2a0a94898dc8bf05c65ca96098080 100644 (file)
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -55,6 +55,15 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">;
  
  def err_conflict_marker : Error<"version control conflict marker in file">;
  
+def err_raw_delim_too_long : Error<
+  "raw string delimiter longer than 16 characters"
+  "; use PREFIX( )PREFIX to delimit raw string">;
+def err_invalid_char_raw_delim : Error<
+  "invalid character '%0' character in raw string delimiter"
+  "; use PREFIX( )PREFIX to delimit raw string">;
+def err_unterminated_raw_string : Error<
+  "raw string missing terminating delimiter )%0\"">;
+
  def ext_multichar_character_literal : ExtWarn<
    "multi-character character constant">, InGroup<MultiChar>;
  def ext_four_char_character_literal : Extension<
diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h

index e24fe9c9ab0fbb5786d26a7a0095005af1f49d37..3bc44b192acc75955ecee667c89eb90ad601dd5d 100644 (file)
--- a/include/clang/Lex/Lexer.h
+++ b/include/clang/Lex/Lexer.h
@@ -485,6 +485,8 @@ private:
    void LexNumericConstant    (Token &Result, const char *CurPtr);
    void LexStringLiteral      (Token &Result, const char *CurPtr,
                                tok::TokenKind Kind);
+  void LexRawStringLiteral   (Token &Result, const char *CurPtr,
+                              tok::TokenKind Kind);
    void LexAngledStringLiteral(Token &Result, const char *CurPtr);
    void LexCharConstant       (Token &Result, const char *CurPtr,
                                tok::TokenKind Kind);
diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h

index 15057299b2a3bea65a594bfd181507cee5703de7..3a3782a3b257cf0f196a1f8e13728f9a630f097e 100644 (file)
--- a/include/clang/Lex/LiteralSupport.h
+++ b/include/clang/Lex/LiteralSupport.h
@@ -197,6 +197,7 @@ public:
  
  private:
    void init(const Token *StringToks, unsigned NumStringToks);
+  void CopyStringFragment(const StringRef &Fragment);
  };
  
  }  // end namespace clang
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp

index 0664cbc21b81b9ebbcfa5e51a98680bf13d9ac3b..0c32c8d9bafa8d1f86436a1d261b73144c064df1 100644 (file)
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -33,6 +33,7 @@
  #include "llvm/Support/Compiler.h"
  #include "llvm/Support/MemoryBuffer.h"
  #include <cctype>
+#include <cstring>
  using namespace clang;
  
  static void InitCharacterInfo();
@@ -760,7 +761,8 @@ enum {
    CHAR_LETTER   = 0x04,  // a-z,A-Z
    CHAR_NUMBER   = 0x08,  // 0-9
    CHAR_UNDER    = 0x10,  // _
-  CHAR_PERIOD   = 0x20   // .
+  CHAR_PERIOD   = 0x20,  // .
+  CHAR_RAWDEL   = 0x40   // {}[]#<>%:;?*+-/^&|~!=,"'
  };
  
  // Statically initialize CharInfo table based on ASCII character set
@@ -785,20 +787,20 @@ static const unsigned char CharInfo[256] =
     0           , 0           , 0           , 0           ,
  //32 SP         33  !         34  "         35  #
  //36  $         37  %         38  &         39  '
-   CHAR_HORZ_WS, 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
+   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
  //40  (         41  )         42  *         43  +
  //44  ,         45  -         46  .         47  /
-   0           , 0           , 0           , 0           ,
-   0           , 0           , CHAR_PERIOD , 0           ,
+   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
  //48  0         49  1         50  2         51  3
  //52  4         53  5         54  6         55  7
     CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
     CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
  //56  8         57  9         58  :         59  ;
  //60  <         61  =         62  >         63  ?
-   CHAR_NUMBER , CHAR_NUMBER , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
+   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
  //64  @         65  A         66  B         67  C
  //68  D         69  E         70  F         71  G
     0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
@@ -813,8 +815,8 @@ static const unsigned char CharInfo[256] =
     CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  //88  X         89  Y         90  Z         91  [
  //92  \         93  ]         94  ^         95  _
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
-   0           , 0           , 0           , CHAR_UNDER  ,
+   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
+   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
  //96  `         97  a         98  b         99  c
  //100  d       101  e        102  f        103  g
     0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
@@ -828,9 +830,9 @@ static const unsigned char CharInfo[256] =
     CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
     CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  //120  x       121  y        122  z        123  {
-//124  |        125  }        126  ~        127 DEL
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
-   0           , 0           , 0           , 0
+//124  |       125  }        126  ~        127 DEL
+   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
  };
  
  static void InitCharacterInfo() {
@@ -888,6 +890,14 @@ static inline bool isNumberBody(unsigned char c) {
      true : false;
  }
  
+/// isRawStringDelimBody - Return true if this is the body character of a
+/// raw string delimiter.
+static inline bool isRawStringDelimBody(unsigned char c) {
+  return (CharInfo[c] &
+          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
+    true : false;
+}
+
  
  //===----------------------------------------------------------------------===//
  // Diagnostics forwarding code.
@@ -1363,6 +1373,78 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
    Result.setLiteralData(TokStart);
  }
  
+/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
+/// having lexed R", LR", u8R", uR", or UR".
+void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
+                                tok::TokenKind Kind) {
+  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
+  //  Between the initial and final double quote characters of the raw string,
+  //  any transformations performed in phases 1 and 2 (trigraphs,
+  //  universal-character-names, and line splicing) are reverted.
+
+  unsigned PrefixLen = 0;
+
+  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
+    ++PrefixLen;
+
+  // If the last character was not a '(', then we didn't lex a valid delimiter.
+  if (CurPtr[PrefixLen] != '(') {
+    if (!isLexingRawMode()) {
+      const char *PrefixEnd = &CurPtr[PrefixLen];
+      if (PrefixLen == 16) {
+        Diag(PrefixEnd, diag::err_raw_delim_too_long);
+      } else {
+        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
+          << StringRef(PrefixEnd, 1);
+      }
+    }
+
+    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
+    // it's possible the '"' was intended to be part of the raw string, but
+    // there's not much we can do about that.
+    while (1) {
+      char C = *CurPtr++;
+
+      if (C == '"')
+        break;
+      if (C == 0 && CurPtr-1 == BufferEnd) {
+        --CurPtr;
+        break;
+      }
+    }
+
+    FormTokenWithChars(Result, CurPtr, tok::unknown);
+    return;
+  }
+
+  // Save prefix and move CurPtr past it
+  const char *Prefix = CurPtr;
+  CurPtr += PrefixLen + 1; // skip over prefix and '('
+
+  while (1) {
+    char C = *CurPtr++;
+
+    if (C == ')') {
+      // Check for prefix match and closing quote.
+      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
+        CurPtr += PrefixLen + 1; // skip over prefix and '"'
+        break;
+      }
+    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
+      if (!isLexingRawMode())
+        Diag(BufferPtr, diag::err_unterminated_raw_string)
+          << StringRef(Prefix, PrefixLen);
+      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
+      return;
+    }
+  }
+
+  // Update the location of token as well as BufferPtr.
+  const char *TokStart = BufferPtr;
+  FormTokenWithChars(Result, CurPtr, Kind);
+  Result.setLiteralData(TokStart);
+}
+
  /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
  /// after having lexed the '<' character.  This is used for #include filenames.
  void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
@@ -2262,12 +2344,36 @@ LexNextToken:
          return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                 tok::utf16_char_constant);
  
-      // UTF-8 string literal
-      if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
-        return LexStringLiteral(Result,
-                              ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
-                                          SizeTmp2, Result),
-                              tok::utf8_string_literal);
+      // UTF-16 raw string literal
+      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+        return LexRawStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::utf16_string_literal);
+
+      if (Char == '8') {
+        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
+
+        // UTF-8 string literal
+        if (Char2 == '"')
+          return LexStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::utf8_string_literal);
+
+        if (Char2 == 'R') {
+          unsigned SizeTmp3;
+          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
+          // UTF-8 raw string literal
+          if (Char3 == '"') {
+            return LexRawStringLiteral(Result,
+                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               SizeTmp3, Result),
+                   tok::utf8_string_literal);
+          }
+        }
+      }
      }
  
      // treat u like the start of an identifier.
@@ -2289,11 +2395,34 @@ LexNextToken:
        if (Char == '\'')
          return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                 tok::utf32_char_constant);
+
+      // UTF-32 raw string literal
+      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+        return LexRawStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::utf32_string_literal);
      }
  
      // treat U like the start of an identifier.
      return LexIdentifier(Result, CurPtr);
  
+  case 'R': // Identifier or C++0x raw string literal
+    // Notify MIOpt that we read a non-whitespace/non-comment token.
+    MIOpt.ReadToken();
+
+    if (Features.CPlusPlus0x) {
+      Char = getCharAndSize(CurPtr, SizeTmp);
+
+      if (Char == '"')
+        return LexRawStringLiteral(Result,
+                                   ConsumeChar(CurPtr, SizeTmp, Result),
+                                   tok::string_literal);
+    }
+
+    // treat R like the start of an identifier.
+    return LexIdentifier(Result, CurPtr);
+
    case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
@@ -2304,6 +2433,14 @@ LexNextToken:
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::wide_string_literal);
  
+    // Wide raw string literal.
+    if (Features.CPlusPlus0x && Char == 'R' &&
+        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+      return LexRawStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::wide_string_literal);
+
      // Wide character constant.
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
@@ -2313,7 +2450,7 @@ LexNextToken:
    // C99 6.4.2: Identifiers.
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
-  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':    /*'U'*/
+  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index a40908bd9f0d7ae3c811df374ecf95e238be96f4..c74b1466f3a83c812e038f03e1d77ae6c50e7317 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -713,6 +713,38 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  }
  
  
+///       character-literal: [C++0x lex.ccon]
+///         ' c-char-sequence '
+///         u' c-char-sequence '
+///         U' c-char-sequence '
+///         L' c-char-sequence '
+///       c-char-sequence:
+///         c-char
+///         c-char-sequence c-char
+///       c-char:
+///         any member of the source character set except the single-quote ',
+///           backslash \, or new-line character
+///         escape-sequence
+///         universal-character-name
+///       escape-sequence: [C++0x lex.ccon]
+///         simple-escape-sequence
+///         octal-escape-sequence
+///         hexadecimal-escape-sequence
+///       simple-escape-sequence:
+///         one of \' \" \? \\ \a \b \f \n \r \t \v
+///       octal-escape-sequence:
+///         \ octal-digit
+///         \ octal-digit octal-digit
+///         \ octal-digit octal-digit octal-digit
+///       hexadecimal-escape-sequence:
+///         \x hexadecimal-digit
+///         hexadecimal-escape-sequence hexadecimal-digit
+///       universal-character-name:
+///         \u hex-quad
+///         \U hex-quad hex-quad
+///       hex-quad:
+///         hex-digit hex-digit hex-digit hex-digit
+///
  CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                       SourceLocation Loc, Preprocessor &PP,
                                       tok::TokenKind kind) {
@@ -825,34 +857,52 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
  }
  
  
-///       string-literal: [C99 6.4.5]
-///          " [s-char-sequence] "
-///         L" [s-char-sequence] "
+///       string-literal: [C++0x lex.string]
+///         encoding-prefix " [s-char-sequence] "
+///         encoding-prefix R raw-string
+///       encoding-prefix:
+///         u8
+///         u
+///         U
+///         L
  ///       s-char-sequence:
  ///         s-char
  ///         s-char-sequence s-char
  ///       s-char:
-///         any source character except the double quote ",
-///           backslash \, or newline character
-///         escape-character
-///         universal-character-name
-///       escape-character: [C99 6.4.4.4]
-///         \ escape-code
+///         any member of the source character set except the double-quote ",
+///           backslash \, or new-line character
+///         escape-sequence
  ///         universal-character-name
-///       escape-code:
-///         character-escape-code
-///         octal-escape-code
-///         hex-escape-code
-///       character-escape-code: one of
-///         n t b r f v a
-///         \ ' " ?
-///       octal-escape-code:
-///         octal-digit
-///         octal-digit octal-digit
-///         octal-digit octal-digit octal-digit
-///       hex-escape-code:
-///         x hex-digit
-///         hex-escape-code hex-digit
+///       raw-string:
+///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
+///       r-char-sequence:
+///         r-char
+///         r-char-sequence r-char
+///       r-char:
+///         any member of the source character set, except a right parenthesis )
+///           followed by the initial d-char-sequence (which may be empty)
+///           followed by a double quote ".
+///       d-char-sequence:
+///         d-char
+///         d-char-sequence d-char
+///       d-char:
+///         any member of the basic source character set except:
+///           space, the left parenthesis (, the right parenthesis ),
+///           the backslash \, and the control characters representing horizontal
+///           tab, vertical tab, form feed, and newline.
+///       escape-sequence: [C++0x lex.ccon]
+///         simple-escape-sequence
+///         octal-escape-sequence
+///         hexadecimal-escape-sequence
+///       simple-escape-sequence:
+///         one of \' \" \? \\ \a \b \f \n \r \t \v
+///       octal-escape-sequence:
+///         \ octal-digit
+///         \ octal-digit octal-digit
+///         \ octal-digit octal-digit octal-digit
+///       hexadecimal-escape-sequence:
+///         \x hexadecimal-digit
+///         hexadecimal-escape-sequence hexadecimal-digit
  ///       universal-character-name:
  ///         \u hex-quad
  ///         \U hex-quad hex-quad
@@ -972,64 +1022,69 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
          ++ThisTokBuf;
      }
  
-    assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
-    ++ThisTokBuf;
+    // Check for raw string
+    if (ThisTokBuf[0] == 'R') {
+      ThisTokBuf += 2; // skip R"
  
-    // Check if this is a pascal string
-    if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
-        ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
-
-      // If the \p sequence is found in the first token, we have a pascal string
-      // Otherwise, if we already have a pascal string, ignore the first \p
-      if (i == 0) {
+      const char *Prefix = ThisTokBuf;
+      while (ThisTokBuf[0] != '(')
          ++ThisTokBuf;
-        Pascal = true;
-      } else if (Pascal)
-        ThisTokBuf += 2;
-    }
+      ++ThisTokBuf; // skip '('
+
+      // remove same number of characters from the end
+      if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix))
+        ThisTokEnd -= (ThisTokBuf - Prefix);
+
+      // Copy the string over
+      CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf));
+    } else {
+      assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
+      ++ThisTokBuf; // skip "
+
+      // Check if this is a pascal string
+      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
+          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
  
-    while (ThisTokBuf != ThisTokEnd) {
-      // Is this a span of non-escape characters?
-      if (ThisTokBuf[0] != '\\') {
-        const char *InStart = ThisTokBuf;
-        do {
+        // If the \p sequence is found in the first token, we have a pascal string
+        // Otherwise, if we already have a pascal string, ignore the first \p
+        if (i == 0) {
            ++ThisTokBuf;
-        } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
-
-        // Copy the character span over.
-        unsigned Len = ThisTokBuf-InStart;
-        if (CharByteWidth == 1) {
-          memcpy(ResultPtr, InStart, Len);
-          ResultPtr += Len;
-        } else {
-          // Note: our internal rep of wide char tokens is always little-endian.
-          for (; Len; --Len, ++InStart) {
-            *ResultPtr++ = InStart[0];
-            // Add zeros at the end.
-            for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
-              *ResultPtr++ = 0;
-          }
-        }
-        continue;
+          Pascal = true;
+        } else if (Pascal)
+          ThisTokBuf += 2;
        }
-      // Is this a Universal Character Name escape?
-      if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
-        EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
-                        hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
-                        CharByteWidth, Diags, Features);
-        continue;
-      }
-      // Otherwise, this is a non-UCN escape character.  Process it.
-      unsigned ResultChar =
-        ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
-                          FullSourceLoc(StringToks[i].getLocation(), SM),
-                          CharByteWidth*8, Diags);
  
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultPtr++ = ResultChar & 0xFF;
+      while (ThisTokBuf != ThisTokEnd) {
+        // Is this a span of non-escape characters?
+        if (ThisTokBuf[0] != '\\') {
+          const char *InStart = ThisTokBuf;
+          do {
+            ++ThisTokBuf;
+          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
+
+          // Copy the character span over.
+          CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart));
+          continue;
+        }
+        // Is this a Universal Character Name escape?
+        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+          EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+                          hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
+                          CharByteWidth, Diags, Features);
+          continue;
+        }
+        // Otherwise, this is a non-UCN escape character.  Process it.
+        unsigned ResultChar =
+          ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+                            FullSourceLoc(StringToks[i].getLocation(), SM),
+                            CharByteWidth*8, Diags);
+
+        // Note: our internal rep of wide char tokens is always little-endian.
+        *ResultPtr++ = ResultChar & 0xFF;
  
-      for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
-        *ResultPtr++ = ResultChar >> i*8;
+        for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+          *ResultPtr++ = ResultChar >> i*8;
+      }
      }
    }
  
@@ -1062,6 +1117,25 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
  }
  
  
+/// CopyStringFragment - This function copies Fragment into ResultPtr,
+/// widening each byte to CharByteWidth bytes (little-endian, zero-padded).
+void StringLiteralParser::CopyStringFragment(const StringRef &Fragment) {
+  // Copy the character span over.
+  if (CharByteWidth == 1) {
+    memcpy(ResultPtr, Fragment.data(), Fragment.size());
+    ResultPtr += Fragment.size();
+  } else {
+    // Note: our internal rep of wide char tokens is always little-endian.
+    for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) {
+      *ResultPtr++ = *I;
+      // Add zeros at the end.
+      for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+        *ResultPtr++ = 0;
+    }
+  }
+}
+
+
  /// getOffsetOfStringByte - This function returns the offset of the
  /// specified byte of the string data represented by Token.  This handles
  /// advancing over escape sequences in the string.
diff --git a/lib/Lex/TokenConcatenation.cpp b/lib/Lex/TokenConcatenation.cpp

index 19baf80aad37f1f7af809b85110a9b53dac97738..d6f3bc493fc51b0460b180f0554c6ffdc4bf24fd 100644 (file)
--- a/lib/Lex/TokenConcatenation.cpp
+++ b/lib/Lex/TokenConcatenation.cpp
@@ -17,39 +17,53 @@
  using namespace clang;
  
  
+/// IsStringPrefix - Return true if Str is exactly a string literal prefix: 'L',
+/// or in C++0x 'u', 'U', 'u8', or a raw form ('R', 'LR', 'uR', 'UR', 'u8R').
+static bool IsStringPrefix(const StringRef &Str, bool CPlusPlus0x) {
+
+  if (Str[0] == 'L' ||
+      (CPlusPlus0x && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) {
+
+    if (Str.size() == 1)
+      return true; // "L", "u", "U", and "R"
+
+    // Check for raw flavors. Need to make sure the first character wasn't
+    // already R. Need CPlusPlus0x check for "LR".
+    if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus0x)
+      return true; // "LR", "uR", "UR"
+
+    // Check for "u8" and "u8R"
+    if (Str[0] == 'u' && Str[1] == '8') {
+      if (Str.size() == 2) return true; // "u8"
+      if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R"
+    }
+  }
+
+  return false;
+}
+
  /// IsIdentifierStringPrefix - Return true if the spelling of the token
-/// is literally 'L', 'u', 'U', or 'u8'.
+/// is literally 'L', 'u', 'U', or 'u8', including their raw ('R') versions.
  bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
    const LangOptions &LangOpts = PP.getLangOptions();
  
    if (!Tok.needsCleaning()) {
-    if (Tok.getLength() != 1 && Tok.getLength() != 2)
+    if (Tok.getLength() < 1 || Tok.getLength() > 3)
        return false;
      SourceManager &SM = PP.getSourceManager();
      const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
-    if (Tok.getLength() == 1)
-      return Ptr[0] == 'L' ||
-             (LangOpts.CPlusPlus0x && (Ptr[0] == 'u' || Ptr[0] == 'U'));
-    if (Tok.getLength() == 2)
-      return LangOpts.CPlusPlus0x && Ptr[0] == 'u' && Ptr[1] == '8';
+    return IsStringPrefix(StringRef(Ptr, Tok.getLength()),
+                          LangOpts.CPlusPlus0x);
    }
  
    if (Tok.getLength() < 256) {
      char Buffer[256];
      const char *TokPtr = Buffer;
      unsigned length = PP.getSpelling(Tok, TokPtr);
-    if (length == 1)
-      return TokPtr[0] == 'L' ||
-             (LangOpts.CPlusPlus0x && (TokPtr[0] == 'u' || TokPtr[0] == 'U'));
-    if (length == 2)
-      return LangOpts.CPlusPlus0x && TokPtr[0] == 'u' && TokPtr[1] == '8';
-    return false;
+    return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus0x);
    }
  
-  std::string TokStr = PP.getSpelling(Tok);
-  return TokStr == "L" || (LangOpts.CPlusPlus0x && (TokStr == "u8" ||
-                                                    TokStr == "u" ||
-                                                    TokStr == "U"));
+  return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus0x);
  }
  
  TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c

index dfa609fe8a399a36367ff110fadf26f1c964e4b7..98216423bc420d31fd3408ede6fd33745e529b55 100644 (file)
--- a/test/CodeGen/string-literal.c
+++ b/test/CodeGen/string-literal.c
@@ -1,6 +1,6 @@
  // RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=C %s
  // RUN: %clang_cc1 -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=C %s
-// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=C %s
+// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -check-prefix=CPP0X %s
  
  #include <stddef.h>
  
@@ -38,5 +38,28 @@ int main() {
  
    // CHECK-CPP0X: private unnamed_addr constant [4 x i8] c"def\00", align 1
    const char *g = u8"def";
+
+  // CHECK-CPP0X: private unnamed_addr constant [4 x i8] c"ghi\00", align 1
+  const char *h = R"foo(ghi)foo";
+
+  // CHECK-CPP0X: private unnamed_addr constant [4 x i8] c"jkl\00", align 1
+  const char *i = u8R"bar(jkl)bar";
+
+  // CHECK-CPP0X: private unnamed_addr constant [6 x i8] c"G\00H\00\00\00", align 2
+  const char16_t *j = uR"foo(GH)foo";
+
+  // CHECK-CPP0X: private unnamed_addr constant [12 x i8] c"I\00\00\00J\00\00\00\00\00\00\00", align 4
+  const char32_t *k = UR"bar(IJ)bar";
+
+  // CHECK-CPP0X: private unnamed_addr constant [12 x i8] c"K\00\00\00L\00\00\00\00\00\00\00", align 4
+  const wchar_t *l = LR"bar(KL)bar";
+
+  // CHECK-CPP0X: private unnamed_addr constant [9 x i8] c"abc\5Cndef\00", align 1
+  const char *m = R"(abc\ndef)";
+
+  // CHECK-CPP0X: private unnamed_addr constant [8 x i8] c"abc\0Adef\00", align 1
+  const char *n = R"(abc
+def)";
+
  #endif
  }
diff --git a/test/Lexer/cxx0x_raw_string_delim_length.cpp b/test/Lexer/cxx0x_raw_string_delim_length.cpp

new file mode 100644 (file)

index 0000000..c7b32f8
--- /dev/null
+++ b/test/Lexer/cxx0x_raw_string_delim_length.cpp
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string delimiter longer than 16 characters'
+
+const char *str = R"abcdefghijkmnopqrstuvwxyz(abcdef)abcdefghijkmnopqrstuvwxyz";
+// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string delimiter longer than 16 characters'
+
+const char *str = R"abcdefghijkmnopqrstuvwxyz(abcdef)abcdefghijkmnopqrstuvwxyz";
diff --git a/test/Lexer/cxx0x_raw_string_unterminated.cpp b/test/Lexer/cxx0x_raw_string_unterminated.cpp

new file mode 100644 (file)

index 0000000..7813c99
--- /dev/null
+++ b/test/Lexer/cxx0x_raw_string_unterminated.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string missing terminating delimiter )foo"'
+
+const char *str = R"foo(abc
+def)bar";
+// RUN: %clang_cc1 -std=c++0x -E %s 2>&1 | grep 'error: raw string missing terminating delimiter )foo"'
+
+const char *str = R"foo(abc
+def)bar";
diff --git a/test/SemaCXX/cxx0x-type-convert-construct.cpp b/test/SemaCXX/cxx0x-type-convert-construct.cpp

index a523108c6e20afb8dbe6bc3ef4bf011cc63e5d14..f32c8e2014b36fa1c76ad2f9d2308332dc195793 100644 (file)
--- a/test/SemaCXX/cxx0x-type-convert-construct.cpp
+++ b/test/SemaCXX/cxx0x-type-convert-construct.cpp
@@ -7,4 +7,15 @@ void f() {
    ustr = u"a UTF-16 string"; // expected-error {{assigning to 'char16_t *' from incompatible type 'const char16_t [16]'}}
    char32_t *Ustr;
    Ustr = U"a UTF-32 string"; // expected-error {{assigning to 'char32_t *' from incompatible type 'const char32_t [16]'}}
+
+  char *Rstr;
+  Rstr = "a raw string"; // expected-warning{{conversion from string literal to 'char *' is deprecated}}
+  wchar_t *LRstr;
+  LRstr = LR"foo(a wide raw string)foo"; // expected-warning{{conversion from string literal to 'wchar_t *' is deprecated}}
+  char *u8Rstr;
+  u8Rstr = u8R"foo(a UTF-8 raw string)foo"; // expected-error {{assigning to 'char *' from incompatible type 'const char [19]'}}
+  char16_t *uRstr;
+  uRstr = uR"foo(a UTF-16 raw string)foo"; // expected-error {{assigning to 'char16_t *' from incompatible type 'const char16_t [20]'}}
+  char32_t *URstr;
+  URstr = UR"foo(a UTF-32 raw string)foo"; // expected-error {{assigning to 'char32_t *' from incompatible type 'const char32_t [20]'}}
  }
author	Craig Topper <craig.topper@gmail.com>
	Thu, 11 Aug 2011 04:06:15 +0000 (04:06 +0000)
committer	Craig Topper <craig.topper@gmail.com>
	Thu, 11 Aug 2011 04:06:15 +0000 (04:06 +0000)
include/clang/Basic/DiagnosticLexKinds.td		patch \| blob \| history
include/clang/Lex/Lexer.h		patch \| blob \| history
include/clang/Lex/LiteralSupport.h		patch \| blob \| history
lib/Lex/Lexer.cpp		patch \| blob \| history
lib/Lex/LiteralSupport.cpp		patch \| blob \| history
lib/Lex/TokenConcatenation.cpp		patch \| blob \| history
test/CodeGen/string-literal.c		patch \| blob \| history
test/Lexer/cxx0x_raw_string_delim_length.cpp	[new file with mode: 0644]	patch \| blob
test/Lexer/cxx0x_raw_string_unterminated.cpp	[new file with mode: 0644]	patch \| blob
test/SemaCXX/cxx0x-type-convert-construct.cpp		patch \| blob \| history