From 8ab09da1faaa33b9fa78de59cc4e191bfe9907b5 Mon Sep 17 00:00:00 2001
From: Richard Trieu <rtrieu@google.com>
Date: Wed, 13 Jun 2012 20:25:24 +0000
Subject: [PATCH] Moved the StringLiteral printing code from StmtPrinter into
 the StringLiteral class and made StmtPrinter and StmtDumper call it.
 This fixes an assertion failure when dumping wchar_t string literals.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@158417 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/AST/Expr.h     |  2 +
 lib/AST/Expr.cpp             | 93 ++++++++++++++++++++++++++++++++++++
 lib/AST/StmtDumper.cpp       | 12 +----
 lib/AST/StmtPrinter.cpp      | 88 +---------------------------------
 test/Misc/ast-dump-wchar.cpp | 13 +++++
 5 files changed, 110 insertions(+), 98 deletions(-)
 create mode 100644 test/Misc/ast-dump-wchar.cpp

diff --git a/include/clang/AST/Expr.h b/include/clang/AST/Expr.h
index 40da0705f3..371263296f 100644
--- a/include/clang/AST/Expr.h
+++ b/include/clang/AST/Expr.h
@@ -1399,6 +1399,8 @@ public:
                      getByteLength());
   }
 
+  void outputString(raw_ostream &OS);
+
   uint32_t getCodeUnit(size_t i) const {
     assert(i < Length && "out of bounds access");
     if (CharByteWidth == 1)
diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp
index d3f6a521f1..5bbf7503f8 100644
--- a/lib/AST/Expr.cpp
+++ b/lib/AST/Expr.cpp
@@ -633,6 +633,99 @@ StringLiteral *StringLiteral::CreateEmpty(ASTContext &C, unsigned NumStrs) {
   return SL;
 }
 
+void StringLiteral::outputString(raw_ostream &OS) {
+  switch (getKind()) {
+  case Ascii: break; // Ascii literals take no encoding prefix.
+  case Wide:  OS << 'L'; break;
+  case UTF8:  OS << "u8"; break;
+  case UTF16: OS << 'u'; break;
+  case UTF32: OS << 'U'; break;
+  }
+  OS << '"';
+  static const char Hex[] = "0123456789ABCDEF";
+  // Index of the last code unit printed as \x; getLength() means none yet.
+  unsigned LastSlashX = getLength();
+  for (unsigned I = 0, N = getLength(); I != N; ++I) {
+    switch (uint32_t Char = getCodeUnit(I)) {
+    default:
+      // FIXME: Convert UTF-8 back to codepoints before rendering.
+
+      // Convert UTF-16 surrogate pairs back to codepoints before rendering.
+      // Leave invalid surrogates alone; we'll use \x for those.
+      if (getKind() == UTF16 && I != N - 1 && Char >= 0xd800 && 
+          Char <= 0xdbff) {
+        uint32_t Trail = getCodeUnit(I + 1);
+        if (Trail >= 0xdc00 && Trail <= 0xdfff) {
+          Char = 0x10000 + ((Char - 0xd800) << 10) + (Trail - 0xdc00);
+          ++I;  // The trail surrogate has been consumed as well.
+        }
+      }
+
+      if (Char > 0xff) {
+        // If this is a wide string, output characters over 0xff using \x
+        // escapes. Otherwise, this is a UTF-16 or UTF-32 string, and Char is a
+        // codepoint: use \x escapes for invalid codepoints.
+        if (getKind() == Wide ||
+            (Char >= 0xd800 && Char <= 0xdfff) || Char >= 0x110000) {
+          // FIXME: Is this the best way to print wchar_t?
+          OS << "\\x";
+          int Shift = 28;
+          while ((Char >> Shift) == 0)  // Skip leading zero nibbles.
+            Shift -= 4;
+          for (/**/; Shift >= 0; Shift -= 4)
+            OS << Hex[(Char >> Shift) & 15];
+          LastSlashX = I;
+          break;
+        }
+
+        if (Char > 0xffff)  // Needs the 8-digit \UXXXXXXXX form.
+          OS << "\\U00"
+             << Hex[(Char >> 20) & 15]
+             << Hex[(Char >> 16) & 15];
+        else
+          OS << "\\u";
+        OS << Hex[(Char >> 12) & 15]
+           << Hex[(Char >>  8) & 15]
+           << Hex[(Char >>  4) & 15]
+           << Hex[(Char >>  0) & 15];
+        break;
+      }
+
+      // If the previous character was printed as \x... and this one is a hex
+      // digit, emit "" to split the literal so it isn't slurped into the \x.
+      if (LastSlashX + 1 == I) {
+        switch (Char) {
+          case '0': case '1': case '2': case '3': case '4':
+          case '5': case '6': case '7': case '8': case '9':
+          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+            OS << "\"\"";
+        }
+      }
+
+      assert(Char <= 0xff &&
+             "Characters above 0xff should already have been handled.");
+
+      if (isprint(Char))
+        OS << (char)Char;
+      else  // Output anything unprintable as a 3-digit octal escape.
+        OS << '\\'
+           << (char)('0' + ((Char >> 6) & 7))
+           << (char)('0' + ((Char >> 3) & 7))
+           << (char)('0' + ((Char >> 0) & 7));
+      break;
+    // Escape backslash and double quote; use mnemonics for common controls.
+    case '\\': OS << "\\\\"; break;
+    case '"': OS << "\\\""; break;
+    case '\n': OS << "\\n"; break;
+    case '\t': OS << "\\t"; break;
+    case '\a': OS << "\\a"; break;
+    case '\b': OS << "\\b"; break;
+    }
+  }
+  OS << '"';
+}
+
 void StringLiteral::setString(ASTContext &C, StringRef Str,
                               StringKind Kind, bool IsPascal) {
   //FIXME: we assume that the string data comes from a target that uses the same
diff --git a/lib/AST/StmtDumper.cpp b/lib/AST/StmtDumper.cpp
index df0052760b..a57cce8371 100644
--- a/lib/AST/StmtDumper.cpp
+++ b/lib/AST/StmtDumper.cpp
@@ -446,18 +446,8 @@ void StmtDumper::VisitFloatingLiteral(FloatingLiteral *Node) {
 
 void StmtDumper::VisitStringLiteral(StringLiteral *Str) {
   DumpExpr(Str);
-  // FIXME: this doesn't print wstrings right.
   OS << " ";
-  switch (Str->getKind()) {
-  case StringLiteral::Ascii: break; // No prefix
-  case StringLiteral::Wide:  OS << 'L'; break;
-  case StringLiteral::UTF8:  OS << "u8"; break;
-  case StringLiteral::UTF16: OS << 'u'; break;
-  case StringLiteral::UTF32: OS << 'U'; break;
-  }
-  OS << '"';
-  OS.write_escaped(Str->getString());
-  OS << '"';
+  Str->outputString(OS);
 }
 
 void StmtDumper::VisitUnaryOperator(UnaryOperator *Node) {
diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp
index 30548aea60..cb757cdde1 100644
--- a/lib/AST/StmtPrinter.cpp
+++ b/lib/AST/StmtPrinter.cpp
@@ -739,93 +739,7 @@ void StmtPrinter::VisitImaginaryLiteral(ImaginaryLiteral *Node) {
 }
 
 void StmtPrinter::VisitStringLiteral(StringLiteral *Str) {
-  switch (Str->getKind()) {
-  case StringLiteral::Ascii: break; // no prefix.
-  case StringLiteral::Wide:  OS << 'L'; break;
-  case StringLiteral::UTF8:  OS << "u8"; break;
-  case StringLiteral::UTF16: OS << 'u'; break;
-  case StringLiteral::UTF32: OS << 'U'; break;
-  }
-  OS << '"';
-  static const char Hex[] = "0123456789ABCDEF";
-
-  unsigned LastSlashX = Str->getLength();
-  for (unsigned I = 0, N = Str->getLength(); I != N; ++I) {
-    switch (uint32_t Char = Str->getCodeUnit(I)) {
-    default:
-      // FIXME: Convert UTF-8 back to codepoints before rendering.
-
-      // Convert UTF-16 surrogate pairs back to codepoints before rendering.
-      // Leave invalid surrogates alone; we'll use \x for those.
-      if (Str->getKind() == StringLiteral::UTF16 && I != N - 1 &&
-          Char >= 0xd800 && Char <= 0xdbff) {
-        uint32_t Trail = Str->getCodeUnit(I + 1);
-        if (Trail >= 0xdc00 && Trail <= 0xdfff) {
-          Char = 0x10000 + ((Char - 0xd800) << 10) + (Trail - 0xdc00);
-          ++I;
-        }
-      }
-
-      if (Char > 0xff) {
-        // If this is a wide string, output characters over 0xff using \x
-        // escapes. Otherwise, this is a UTF-16 or UTF-32 string, and Char is a
-        // codepoint: use \x escapes for invalid codepoints.
-        if (Str->getKind() == StringLiteral::Wide ||
-            (Char >= 0xd800 && Char <= 0xdfff) || Char >= 0x110000) {
-          // FIXME: Is this the best way to print wchar_t?
-          OS << "\\x";
-          int Shift = 28;
-          while ((Char >> Shift) == 0)
-            Shift -= 4;
-          for (/**/; Shift >= 0; Shift -= 4)
-            OS << Hex[(Char >> Shift) & 15];
-          LastSlashX = I;
-          break;
-        }
-
-        if (Char > 0xffff)
-          OS << "\\U00"
-             << Hex[(Char >> 20) & 15]
-             << Hex[(Char >> 16) & 15];
-        else
-          OS << "\\u";
-        OS << Hex[(Char >> 12) & 15]
-           << Hex[(Char >>  8) & 15]
-           << Hex[(Char >>  4) & 15]
-           << Hex[(Char >>  0) & 15];
-        break;
-      }
-
-      // If we used \x... for the previous character, and this character is a
-      // hexadecimal digit, prevent it being slurped as part of the \x.
-      if (LastSlashX + 1 == I) {
-        switch (Char) {
-          case '0': case '1': case '2': case '3': case '4':
-          case '5': case '6': case '7': case '8': case '9':
-          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-            OS << "\"\"";
-        }
-      }
-
-      if (Char <= 0xff && isprint(Char))
-        OS << (char)Char;
-      else  // Output anything hard as an octal escape.
-        OS << '\\'
-           << (char)('0' + ((Char >> 6) & 7))
-           << (char)('0' + ((Char >> 3) & 7))
-           << (char)('0' + ((Char >> 0) & 7));
-      break;
-    // Handle some common non-printable cases to make dumps prettier.
-    case '\\': OS << "\\\\"; break;
-    case '"': OS << "\\\""; break;
-    case '\n': OS << "\\n"; break;
-    case '\t': OS << "\\t"; break;
-    case '\a': OS << "\\a"; break;
-    case '\b': OS << "\\b"; break;
-    }
-  }
-  OS << '"';
+  Str->outputString(OS);
 }
 void StmtPrinter::VisitParenExpr(ParenExpr *Node) {
   OS << "(";
diff --git a/test/Misc/ast-dump-wchar.cpp b/test/Misc/ast-dump-wchar.cpp
new file mode 100644
index 0000000000..4153706bd6
--- /dev/null
+++ b/test/Misc/ast-dump-wchar.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -std=c++11 -ast-dump %s 2>&1 | FileCheck %s 
+
+char c8[] = u8"test\0\\\"\t\a\b\234";
+// CHECK: char c8[12] = (StringLiteral {{.*}} lvalue u8"test\000\\\"\t\a\b\234")
+
+char16_t c16[] = u"test\0\\\"\t\a\b\234\u1234";
+// CHECK: char16_t c16[13] = (StringLiteral {{.*}} lvalue u"test\000\\\"\t\a\b\234\u1234")
+
+char32_t c32[] = U"test\0\\\"\t\a\b\234\u1234\U0010ffff"; // \
+// CHECK: char32_t c32[14] = (StringLiteral {{.*}} lvalue U"test\000\\\"\t\a\b\234\u1234\U0010FFFF")
+
+wchar_t wc[] = L"test\0\\\"\t\a\b\234\u1234\xffffffff"; // \
+// CHECK: wchar_t wc[14] = (StringLiteral {{.*}} lvalue L"test\000\\\"\t\a\b\234\x1234\xFFFFFFFF")
-- 
2.50.1