Implement resolving of HTML character references (named: &amp;, decimal: &#42;, …

author Dmitri Gribenko <gribozavr@gmail.com>

Fri, 27 Jul 2012 20:37:06 +0000 (20:37 +0000)

committer Dmitri Gribenko <gribozavr@gmail.com>

Fri, 27 Jul 2012 20:37:06 +0000 (20:37 +0000)
author Dmitri Gribenko <gribozavr@gmail.com>
Fri, 27 Jul 2012 20:37:06 +0000 (20:37 +0000)
committer Dmitri Gribenko <gribozavr@gmail.com>
Fri, 27 Jul 2012 20:37:06 +0000 (20:37 +0000)
diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h

index dc014fdb250aa714e56fc0626c4460f349a5dd16..5b69a95ee0601f0610314b6878abed99a158de05 100644 (file)
--- a/include/clang/AST/CommentLexer.h
+++ b/include/clang/AST/CommentLexer.h
@@ -211,6 +211,10 @@ private:
    Lexer(const Lexer&);          // DO NOT IMPLEMENT
    void operator=(const Lexer&); // DO NOT IMPLEMENT
  
+  /// Allocator for strings that are semantic values of tokens and have to be
+  /// computed (for example, resolved decimal character references).
+  llvm::BumpPtrAllocator &Allocator;
+
    const char *const BufferStart;
    const char *const BufferEnd;
    SourceLocation FileLoc;
@@ -289,6 +293,16 @@ private:
  
    bool isVerbatimLineCommand(StringRef Name) const;
  
+  /// Given a character reference name (e.g., "lt"), return the character that
+  /// it stands for (e.g., "<").
+  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
+
+  /// Given a Unicode codepoint as base-10 integer, return the character.
+  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
+
+  /// Given a Unicode codepoint as base-16 integer, return the character.
+  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
+
    void formTokenWithChars(Token &Result, const char *TokEnd,
                            tok::TokenKind Kind) {
      const unsigned TokLen = TokEnd - BufferPtr;
@@ -302,6 +316,12 @@ private:
      BufferPtr = TokEnd;
    }
  
+  void formTextToken(Token &Result, const char *TokEnd) { // Emit [BufferPtr, TokEnd) as a tok::text token.
+    StringRef Text(BufferPtr, TokEnd - BufferPtr); // Captured first: formTokenWithChars moves BufferPtr to TokEnd.
+    formTokenWithChars(Result, TokEnd, tok::text);
+    Result.setText(Text); // The token's text is the raw source characters, unmodified.
+  }
+
    SourceLocation getSourceLocation(const char *Loc) const {
      assert(Loc >= BufferStart && Loc <= BufferEnd &&
             "Location out of range for this buffer!");
@@ -328,6 +348,8 @@ private:
  
    void lexVerbatimLineText(Token &T);
  
+  void lexHTMLCharacterReference(Token &T);
+
    void setupAndLexHTMLStartTag(Token &T);
  
    void lexHTMLStartTag(Token &T);
@@ -337,7 +359,8 @@ private:
    void lexHTMLEndTag(Token &T);
  
  public:
-  Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
+  Lexer(llvm::BumpPtrAllocator &Allocator,
+        SourceLocation FileLoc, const CommentOptions &CommOpts,
          const char *BufferStart, const char *BufferEnd);
  
    void lex(Token &T);
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp

index 5b57ce45577e0b48931163bc0c65178234e8e683..46a4d87f9a9b89a3ee5c41dc84ba4bf21d827834 100644 (file)
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -216,7 +216,8 @@ comments::FullComment *ASTContext::getCommentForDecl(const Decl *D) const {
      return NULL;
  
    const StringRef RawText = RC->getRawText(SourceMgr);
-  comments::Lexer L(RC->getSourceRange().getBegin(), comments::CommentOptions(),
+  comments::Lexer L(getAllocator(),
+                    RC->getSourceRange().getBegin(), comments::CommentOptions(),
                      RawText.begin(), RawText.end());
  
    comments::Sema S(getAllocator(), getSourceManager(), getDiagnostics());
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp

index 31468321cf4036b23731e300438262a98d841e05..dde484510ff180acc4c9bd485ddaa6278c2d7b75 100644 (file)
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -1,4 +1,5 @@
  #include "clang/AST/CommentLexer.h"
+#include "clang/Basic/ConvertUTF.h"
  #include "llvm/ADT/StringSwitch.h"
  #include "llvm/Support/ErrorHandling.h"
  
@@ -87,6 +88,71 @@ bool Lexer::isVerbatimLineCommand(StringRef Name) const {
    return false;
  }
  
+namespace { // Character classifiers used while scanning HTML character references.
+bool isHTMLNamedCharacterReferenceCharacter(char C) { // True for ASCII letters only.
+  return (C >= 'a' && C <= 'z') ||
+         (C >= 'A' && C <= 'Z');
+}
+
+bool isHTMLDecimalCharacterReferenceCharacter(char C) { // True for ASCII decimal digits.
+  return C >= '0' && C <= '9';
+}
+
+bool isHTMLHexCharacterReferenceCharacter(char C) { // True for ASCII hex digits, either case.
+  return (C >= '0' && C <= '9') ||
+         (C >= 'a' && C <= 'f') ||
+         (C >= 'A' && C <= 'F');
+}
+} // unnamed namespace
+
+StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { // Name excludes the '&' and ';'.
+  return llvm::StringSwitch<StringRef>(Name) // Exact, case-sensitive match against the five names below.
+      .Case("amp", "&")
+      .Case("lt", "<")
+      .Case("gt", ">")
+      .Case("quot", "\"")
+      .Case("apos", "\'")
+      .Default(""); // Any other name yields an empty StringRef.
+}
+
+StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
+    CodePoint *= 10;
+    CodePoint += Name[i] - '0';
+  }
+
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    CodePoint *= 16;
+    const char C = Name[i];
+    assert(isHTMLHexCharacterReferenceCharacter(C));
+    if (C >= '0' && C <= '9')
+      CodePoint += Name[i] - '0';
+    else if (C >= 'a' && C <= 'f')
+      CodePoint += Name[i] - 'a' + 10;
+    else
+      CodePoint += Name[i] - 'A' + 10;
+  }
+
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
  void Lexer::skipLineStartingDecorations() {
    // This function should be called only for C comments
    assert(CommentState == LCS_InsideCComment);
@@ -147,6 +213,33 @@ const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
    return BufferPtr;
  }
  
+const char *skipNamedCharacterReference(const char *BufferPtr, // Scan start.
+                                        const char *BufferEnd) { // One past the last scannable char.
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr; // First character that is not an ASCII letter.
+  }
+  return BufferEnd; // Every character up to the end was a letter.
+}
+
+const char *skipDecimalCharacterReference(const char *BufferPtr, // Scan start.
+                                          const char *BufferEnd) { // One past the last scannable char.
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr; // First character that is not a decimal digit.
+  }
+  return BufferEnd; // Every character up to the end was a decimal digit.
+}
+
+const char *skipHexCharacterReference(const char *BufferPtr, // Scan start.
+                                          const char *BufferEnd) { // One past the last scannable char.
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr; // First character that is not a hex digit.
+  }
+  return BufferEnd; // Every character up to the end was a hex digit.
+}
+
  bool isHTMLIdentifierStartingCharacter(char C) {
    return (C >= 'a' && C <= 'z') ||
           (C >= 'A' && C <= 'Z');
@@ -295,9 +388,7 @@ void Lexer::lexCommentText(Token &T) {
        case '@': {
          TokenPtr++;
          if (TokenPtr == CommentEnd) {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
+          formTextToken(T, TokenPtr);
            return;
          }
          char C = *TokenPtr;
@@ -322,9 +413,7 @@ void Lexer::lexCommentText(Token &T) {
  
          // Don't make zero-length commands.
          if (!isCommandNameCharacter(*TokenPtr)) {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
+          formTextToken(T, TokenPtr);
            return;
          }
  
@@ -357,12 +446,14 @@ void Lexer::lexCommentText(Token &T) {
          return;
        }
  
+      case '&':
+        lexHTMLCharacterReference(T);
+        return;
+
        case '<': {
          TokenPtr++;
          if (TokenPtr == CommentEnd) {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
+          formTextToken(T, TokenPtr);
            return;
          }
          const char C = *TokenPtr;
@@ -370,11 +461,9 @@ void Lexer::lexCommentText(Token &T) {
            setupAndLexHTMLStartTag(T);
          else if (C == '/')
            setupAndLexHTMLEndTag(T);
-        else {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
-        }
+        else
+          formTextToken(T, TokenPtr);
+
          return;
        }
  
@@ -394,12 +483,10 @@ void Lexer::lexCommentText(Token &T) {
              break;
            const char C = *TokenPtr;
            if(C == '\n' || C == '\r' ||
-             C == '\\' || C == '@' || C == '<')
+             C == '\\' || C == '@' || C == '&' || C == '<')
              break;
          }
-        StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-        formTokenWithChars(T, TokenPtr, tok::text);
-        T.setText(Text);
+        formTextToken(T, TokenPtr);
          return;
        }
      }
@@ -506,6 +593,69 @@ void Lexer::lexVerbatimLineText(Token &T) {
    State = LS_Normal;
  }
  
+void Lexer::lexHTMLCharacterReference(Token &T) { // Lex "&name;", "&#ddd;" or "&#xhhh;" at BufferPtr into one text token.
+  const char *TokenPtr = BufferPtr;
+  assert(*TokenPtr == '&');
+  TokenPtr++;
+  if (TokenPtr == CommentEnd) { // '&' is the last character: emit it as plain text.
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  const char *NamePtr; // Start of the name/digits; assigned on every path that reaches the ';' check below.
+  bool isNamed = false;
+  bool isDecimal = false; // With both flags false, the reference is hexadecimal.
+  char C = *TokenPtr;
+  if (isHTMLNamedCharacterReferenceCharacter(C)) {
+    NamePtr = TokenPtr;
+    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
+    isNamed = true;
+  } else if (C == '#') {
+    TokenPtr++;
+    if (TokenPtr == CommentEnd) { // "&#" ends the comment: emit it as plain text.
+      formTextToken(T, TokenPtr);
+      return;
+    }
+    C = *TokenPtr;
+    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
+      NamePtr = TokenPtr;
+      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
+      isDecimal = true;
+    } else if (C == 'x' || C == 'X') {
+      TokenPtr++; // Step over the 'x'/'X' so NamePtr addresses the first hex digit.
+      NamePtr = TokenPtr;
+      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
+    } else {
+      formTextToken(T, TokenPtr); // "&#" followed by anything else: emit just "&#" as text.
+      return;
+    }
+  } else {
+    formTextToken(T, TokenPtr); // '&' followed by a non-letter, non-'#': emit just "&" as text.
+    return;
+  }
+  if (NamePtr == TokenPtr || TokenPtr == CommentEnd || // Empty name/digits, or input ended,
+      *TokenPtr != ';') { // or no terminating ';': emit what was scanned as raw text.
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  StringRef Name(NamePtr, TokenPtr - NamePtr);
+  TokenPtr++; // Skip semicolon.
+  StringRef Resolved;
+  if (isNamed)
+    Resolved = resolveHTMLNamedCharacterReference(Name);
+  else if (isDecimal)
+    Resolved = resolveHTMLDecimalCharacterReference(Name);
+  else
+    Resolved = resolveHTMLHexCharacterReference(Name);
+
+  if (Resolved.empty()) { // Resolver produced nothing: keep the whole reference as raw text.
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  formTokenWithChars(T, TokenPtr, tok::text); // Token covers the whole reference in the source...
+  T.setText(Resolved); // ...but its text is the resolved replacement.
+  return;
+}
+
  void Lexer::setupAndLexHTMLStartTag(Token &T) {
    assert(BufferPtr[0] == '<' &&
           isHTMLIdentifierStartingCharacter(BufferPtr[1]));
@@ -561,11 +711,9 @@ void Lexer::lexHTMLStartTag(Token &T) {
        if (TokenPtr != CommentEnd && *TokenPtr == '>') {
          TokenPtr++;
          formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
-      } else {
-        StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-        formTokenWithChars(T, TokenPtr, tok::text);
-        T.setText(Text);
-      }
+      } else
+        formTextToken(T, TokenPtr);
+
        State = LS_Normal;
        return;
      }
@@ -609,8 +757,10 @@ void Lexer::lexHTMLEndTag(Token &T) {
    State = LS_Normal;
  }
  
-Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
+Lexer::Lexer(llvm::BumpPtrAllocator &Allocator,
+             SourceLocation FileLoc, const CommentOptions &CommOpts,
               const char *BufferStart, const char *BufferEnd):
+    Allocator(Allocator),
      BufferStart(BufferStart), BufferEnd(BufferEnd),
      FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
      CommentState(LCS_BeforeComment), State(LS_Normal) {
diff --git a/lib/AST/RawCommentList.cpp b/lib/AST/RawCommentList.cpp

index 7e183e2f2d3268dcf7716813d196574098399647..41866cf03f114a1e5a57b8a38fc6f874ddd12dc7 100644 (file)
--- a/lib/AST/RawCommentList.cpp
+++ b/lib/AST/RawCommentList.cpp
@@ -134,7 +134,13 @@ const char *RawComment::extractBriefText(const ASTContext &Context) const {
    // Make sure that RawText is valid.
    getRawText(Context.getSourceManager());
  
-  comments::Lexer L(Range.getBegin(), comments::CommentOptions(),
+  // Since we will be copying the resulting text, all allocations made during
+  // parsing are garbage after resulting string is formed.  Thus we can use
+  // a separate allocator for all temporary stuff.
+  llvm::BumpPtrAllocator Allocator;
+
+  comments::Lexer L(Allocator,
+                    Range.getBegin(), comments::CommentOptions(),
                      RawText.begin(), RawText.end());
    comments::BriefParser P(L);
  
diff --git a/test/Index/annotate-comments.cpp b/test/Index/annotate-comments.cpp

index 926e3d8baee21f453fa7623fff3c0c59a94f365b..5aebb6dee0f765439dc4c7ab59560909ec41e915 100644 (file)
--- a/test/Index/annotate-comments.cpp
+++ b/test/Index/annotate-comments.cpp
@@ -323,6 +323,9 @@ void comment_to_html_conversion_23();
  /// &amp; &lt; &gt; &quot;
  void comment_to_html_conversion_24();
  
+/// <em>0&lt;i</em>
+void comment_to_html_conversion_25();
+
  #endif
  
  // RUN: rm -rf %t
@@ -642,9 +645,26 @@ void comment_to_html_conversion_24();
  // CHECK-NEXT:         (CXComment_Text Text=[.])
  // CHECK-NEXT:         (CXComment_Text Text=[ ] IsWhitespace)
  // CHECK-NEXT:         (CXComment_Text Text=[::])))]
-// CHECK: annotate-comments.cpp:324:6: FunctionDecl=comment_to_html_conversion_24:{{.*}} FullCommentAsHTML=[<p class="para-brief"> &amp;amp; &amp;lt; &amp;gt; &amp;quot;</p>]
-// CHECK:  CommentAST=[
-// CHECK:    (CXComment_FullComment
-// CHECK:       (CXComment_Paragraph
-// CHECK:         (CXComment_Text Text=[ &amp; &lt; &gt; &quot;])))]
+// CHECK: annotate-comments.cpp:324:6: FunctionDecl=comment_to_html_conversion_24:{{.*}} FullCommentAsHTML=[<p class="para-brief"> &amp; &lt; &gt; &quot;</p>]
+// CHECK-NEXT:  CommentAST=[
+// CHECK-NEXT:    (CXComment_FullComment
+// CHECK-NEXT:       (CXComment_Paragraph
+// CHECK-NEXT:         (CXComment_Text Text=[ ] IsWhitespace)
+// CHECK-NEXT:         (CXComment_Text Text=[&])
+// CHECK-NEXT:         (CXComment_Text Text=[ ] IsWhitespace)
+// CHECK-NEXT:         (CXComment_Text Text=[<])
+// CHECK-NEXT:         (CXComment_Text Text=[ ] IsWhitespace)
+// CHECK-NEXT:         (CXComment_Text Text=[>])
+// CHECK-NEXT:         (CXComment_Text Text=[ ] IsWhitespace)
+// CHECK-NEXT:         (CXComment_Text Text=["])))]
+// CHECK: annotate-comments.cpp:327:6: FunctionDecl=comment_to_html_conversion_25:{{.*}} FullCommentAsHTML=[<p class="para-brief"> <em>0&lt;i</em></p>]
+// CHECK-NEXT:  CommentAST=[
+// CHECK-NEXT:    (CXComment_FullComment
+// CHECK-NEXT:       (CXComment_Paragraph
+// CHECK-NEXT:         (CXComment_Text Text=[ ] IsWhitespace)
+// CHECK-NEXT:         (CXComment_HTMLStartTag Name=[em])
+// CHECK-NEXT:         (CXComment_Text Text=[0])
+// CHECK-NEXT:         (CXComment_Text Text=[<])
+// CHECK-NEXT:         (CXComment_Text Text=[i])
+// CHECK-NEXT:         (CXComment_HTMLEndTag Name=[em])))]
  
diff --git a/unittests/AST/CommentLexer.cpp b/unittests/AST/CommentLexer.cpp

index dd92df421f22b143c74f7b5bbb101c24e59bc932..8b5d0c8cf015ddf836f7efd9dc2f416a537f2d19 100644 (file)
--- a/unittests/AST/CommentLexer.cpp
+++ b/unittests/AST/CommentLexer.cpp
@@ -37,6 +37,7 @@ protected:
    IntrusiveRefCntPtr<DiagnosticIDs> DiagID;
    DiagnosticsEngine Diags;
    SourceManager SourceMgr;
+  llvm::BumpPtrAllocator Allocator;
  
    void lexString(const char *Source, std::vector<Token> &Toks);
  };
@@ -47,7 +48,7 @@ void CommentLexerTest::lexString(const char *Source,
    FileID File = SourceMgr.createFileIDForMemBuffer(Buf);
    SourceLocation Begin = SourceMgr.getLocForStartOfFile(File);
  
-  comments::Lexer L(Begin, CommentOptions(),
+  comments::Lexer L(Allocator, Begin, CommentOptions(),
                      Source, Source + strlen(Source));
  
    while (1) {
@@ -1272,6 +1273,324 @@ TEST_F(CommentLexerTest, HTML20) {
    }
  }
  
+TEST_F(CommentLexerTest, HTMLCharacterReferences1) { // A lone '&' at the end of the comment is plain text.
+  const char *Source = "// &";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&"),    Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences2) { // '&' before a non-name character: "&" and "!" are separate text tokens.
+  const char *Source = "// &!";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&"),    Toks[1].getText());
+
+  ASSERT_EQ(tok::text,         Toks[2].getKind());
+  ASSERT_EQ(StringRef("!"),    Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences3) { // Named reference cut off before ';' stays as raw text.
+  const char *Source = "// &amp";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&amp"), Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences4) { // Named reference followed by '!' instead of ';' stays as raw text.
+  const char *Source = "// &amp!";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&amp"), Toks[1].getText());
+
+  ASSERT_EQ(tok::text,         Toks[2].getKind());
+  ASSERT_EQ(StringRef("!"),    Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences5) { // "&#" at the end of the comment is plain text.
+  const char *Source = "// &#";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#"),   Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences6) { // "&#" before a character that is neither a digit nor 'x': "&#" and "a" are separate.
+  const char *Source = "// &#a";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#"),   Toks[1].getText());
+
+  ASSERT_EQ(tok::text,         Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"),    Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences7) { // Decimal reference cut off before ';' stays as raw text.
+  const char *Source = "// &#42";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#42"), Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences8) { // Decimal reference followed by 'a' instead of ';' stays as raw text.
+  const char *Source = "// &#42a";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#42"), Toks[1].getText());
+
+  ASSERT_EQ(tok::text,         Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"),    Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences9) { // "&#x" with no hex digits at the end of the comment is plain text.
+  const char *Source = "// &#x";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#x"),  Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences10) { // "&#x" before a non-hex character: "&#x" and "z" are separate text tokens.
+  const char *Source = "// &#xz";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,         Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),    Toks[0].getText());
+
+  ASSERT_EQ(tok::text,         Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#x"),  Toks[1].getText());
+
+  ASSERT_EQ(tok::text,         Toks[2].getKind());
+  ASSERT_EQ(StringRef("z"),    Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,      Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences11) { // Hex reference cut off before ';' stays as raw text.
+  const char *Source = "// &#xab";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::text,          Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#xab"), Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences12) { // Mixed-case hex digits followed by 'z' instead of ';' stay as raw text.
+  const char *Source = "// &#xaBz";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::text,          Toks[1].getKind());
+  ASSERT_EQ(StringRef("&#xaB"), Toks[1].getText());
+
+  ASSERT_EQ(tok::text,          Toks[2].getKind());
+  ASSERT_EQ(StringRef("z"),     Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences13) { // A complete named reference resolves to its character.
+  const char *Source = "// &amp;";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(3U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::text,          Toks[1].getKind());
+  ASSERT_EQ(StringRef("&"),     Toks[1].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences14) { // Two adjacent references resolve into two separate text tokens.
+  const char *Source = "// &amp;&lt;";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::text,          Toks[1].getKind());
+  ASSERT_EQ(StringRef("&"),     Toks[1].getText());
+
+  ASSERT_EQ(tok::text,          Toks[2].getKind());
+  ASSERT_EQ(StringRef("<"),     Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences15) { // Text after a resolved reference is lexed as its own text token.
+  const char *Source = "// &amp; meow";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::text,          Toks[1].getKind());
+  ASSERT_EQ(StringRef("&"),     Toks[1].getText());
+
+  ASSERT_EQ(tok::text,          Toks[2].getKind());
+  ASSERT_EQ(StringRef(" meow"), Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences16) { // Decimal, "&#x" and "&#X" spellings of code point 61 all resolve to "=".
+  const char *Sources[] = {
+    "// &#61;",
+    "// &#x3d;",
+    "// &#X3d;"
+  };
+
+  for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) {
+    std::vector<Token> Toks;
+
+    lexString(Sources[i], Toks);
+
+    ASSERT_EQ(3U, Toks.size());
+
+    ASSERT_EQ(tok::text,          Toks[0].getKind());
+    ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+    ASSERT_EQ(tok::text,          Toks[1].getKind());
+    ASSERT_EQ(StringRef("="),     Toks[1].getText());
+
+    ASSERT_EQ(tok::newline,       Toks[2].getKind());
+  }
+}
+
  TEST_F(CommentLexerTest, MultipleComments) {
    const char *Source =
      "// Aaa\n"
diff --git a/unittests/AST/CommentParser.cpp b/unittests/AST/CommentParser.cpp

index 47433aee2be5e72ea145ad83fb3a710528370fea..ff931955595a9cb6e263700b37eb1e368d051112 100644 (file)
--- a/unittests/AST/CommentParser.cpp
+++ b/unittests/AST/CommentParser.cpp
@@ -54,7 +54,7 @@ FullComment *CommentParserTest::parseString(const char *Source) {
    FileID File = SourceMgr.createFileIDForMemBuffer(Buf);
    SourceLocation Begin = SourceMgr.getLocForStartOfFile(File);
  
-  comments::Lexer L(Begin, CommentOptions(),
+  comments::Lexer L(Allocator, Begin, CommentOptions(),
                      Source, Source + strlen(Source));
  
    comments::Sema S(Allocator, SourceMgr, Diags);
author	Dmitri Gribenko <gribozavr@gmail.com>
	Fri, 27 Jul 2012 20:37:06 +0000 (20:37 +0000)
committer	Dmitri Gribenko <gribozavr@gmail.com>
	Fri, 27 Jul 2012 20:37:06 +0000 (20:37 +0000)
include/clang/AST/CommentLexer.h		patch \| blob \| history
lib/AST/ASTContext.cpp		patch \| blob \| history
lib/AST/CommentLexer.cpp		patch \| blob \| history
lib/AST/RawCommentList.cpp		patch \| blob \| history
test/Index/annotate-comments.cpp		patch \| blob \| history
unittests/AST/CommentLexer.cpp		patch \| blob \| history
unittests/AST/CommentParser.cpp		patch \| blob \| history