Comment lexing: fix lexing to actually work in non-error cases.

author Dmitri Gribenko <gribozavr@gmail.com>

Mon, 9 Jul 2012 21:32:40 +0000 (21:32 +0000)

committer Dmitri Gribenko <gribozavr@gmail.com>

Mon, 9 Jul 2012 21:32:40 +0000 (21:32 +0000)
author Dmitri Gribenko <gribozavr@gmail.com>
Mon, 9 Jul 2012 21:32:40 +0000 (21:32 +0000)
committer Dmitri Gribenko <gribozavr@gmail.com>
Mon, 9 Jul 2012 21:32:40 +0000 (21:32 +0000)
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp

index 77d2a9b72dd9a4104ddc0a4304d184618aba4b24..55cd409a9ca16435ec549bb076335f8abd058b0d 100644 (file)
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -147,6 +147,11 @@ const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
    return BufferPtr;
  }
  
+bool isHTMLIdentifierStartingCharacter(char C) {  // letters only, no digits
+  return (C >= 'a' && C <= 'z') ||  // lowercase 'a'..'z'
+         (C >= 'A' && C <= 'Z');    // uppercase 'A'..'Z'
+}
+
  bool isHTMLIdentifierCharacter(char C) {
    return (C >= 'a' && C <= 'z') ||
           (C >= 'A' && C <= 'Z') ||
@@ -357,7 +362,7 @@ void Lexer::lexCommentText(Token &T) {
            return;
          }
          const char C = *TokenPtr;
-        if (isHTMLIdentifierCharacter(C))
+        if (isHTMLIdentifierStartingCharacter(C))
            setupAndLexHTMLOpenTag(T);
          else if (C == '/')
            setupAndLexHTMLCloseTag(T);
@@ -383,7 +388,7 @@ void Lexer::lexCommentText(Token &T) {
            TokenPtr++;
            if (TokenPtr == CommentEnd)
              break;
-          char C = *TokenPtr;
+          const char C = *TokenPtr;
            if(C == '\n' || C == '\r' ||
               C == '\\' || C == '@' || C == '<')
              break;
@@ -492,7 +497,8 @@ void Lexer::lexVerbatimLineText(Token &T) {
  }
  
  void Lexer::setupAndLexHTMLOpenTag(Token &T) {
-  assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1]));
+  assert(BufferPtr[0] == '<' &&
+         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
    const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  
    StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
@@ -501,12 +507,9 @@ void Lexer::setupAndLexHTMLOpenTag(Token &T) {
  
    BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  
-  if (BufferPtr != CommentEnd && *BufferPtr == '>') {
-    BufferPtr++;
-    return;
-  }
-
-  if (BufferPtr != CommentEnd && isHTMLIdentifierCharacter(*BufferPtr))
+  const char C = *BufferPtr;
+  if (BufferPtr != CommentEnd &&
+      (C == '>' || isHTMLIdentifierStartingCharacter(C)))
      State = LS_HTMLOpenTag;
  }
  
@@ -541,7 +544,8 @@ void Lexer::lexHTMLOpenTag(Token &T) {
      case '>':
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_greater);
-      break;
+      State = LS_Normal;
+      return;
      }
    }
  
@@ -554,7 +558,7 @@ void Lexer::lexHTMLOpenTag(Token &T) {
    }
  
    C = *BufferPtr;
-  if (!isHTMLIdentifierCharacter(C) &&
+  if (!isHTMLIdentifierStartingCharacter(C) &&
        C != '=' && C != '\"' && C != '\'' && C != '>') {
      State = LS_Normal;
      return;
@@ -656,8 +660,9 @@ again:
        EndWhitespace++;
  
      // Turn any whitespace between comments (and there is only whitespace
-    // between them) into a newline.  We have two newlines between C comments
-    // in total (first one was synthesized after a comment).
+    // between them -- guaranteed by comment extraction) into a newline.  We
+    // have two newlines between C comments in total (first one was synthesized
+    // after a comment).
      formTokenWithChars(T, EndWhitespace, tok::newline);
  
      CommentState = LCS_BeforeComment;
diff --git a/unittests/AST/CommentLexer.cpp b/unittests/AST/CommentLexer.cpp

index 0a52364987ebea8de7ab430cc485d79e38b3b3f5..e1089cc5dce3b6b075dd3c183cd4b84ea9f8bd31 100644 (file)
--- a/unittests/AST/CommentLexer.cpp
+++ b/unittests/AST/CommentLexer.cpp
@@ -802,6 +802,28 @@ TEST_F(CommentLexerTest, HTML1) {
  }
  
  TEST_F(CommentLexerTest, HTML2) {
+  const char *Source =  // '<' directly followed by a digit
+    "// a<2";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());  // text, text, text, newline
+
+  ASSERT_EQ(tok::text,       Toks[0].getKind());
+  ASSERT_EQ(StringRef(" a"), Toks[0].getText());
+
+  ASSERT_EQ(tok::text,       Toks[1].getKind());  // '<' is plain text here
+  ASSERT_EQ(StringRef("<"),  Toks[1].getText());
+
+  ASSERT_EQ(tok::text,       Toks[2].getKind());
+  ASSERT_EQ(StringRef("2"),  Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,    Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML3) {
    const char *Source =
      "// < tag";
  
@@ -823,7 +845,7 @@ TEST_F(CommentLexerTest, HTML2) {
    ASSERT_EQ(tok::newline,      Toks[3].getKind());
  }
  
-TEST_F(CommentLexerTest, HTML3) {
+TEST_F(CommentLexerTest, HTML4) {
    const char *Sources[] = {
      "// <tag",
      "// <tag "
@@ -846,7 +868,52 @@ TEST_F(CommentLexerTest, HTML3) {
    }
  }
  
-TEST_F(CommentLexerTest, HTML4) {
+TEST_F(CommentLexerTest, HTML5) {  // digits right after an open tag name
+  const char *Source =
+    "// <tag 42";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(4U, Toks.size());  // text, html_tag_open, text, newline
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::html_tag_open, Toks[1].getKind());
+  ASSERT_EQ(StringRef("tag"),   Toks[1].getHTMLTagOpenName());
+
+  ASSERT_EQ(tok::text,          Toks[2].getKind());  // "42" is lexed as text
+  ASSERT_EQ(StringRef("42"),    Toks[2].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML6) {  // text following a complete "<tag>"
+  const char *Source = "// <tag> Meow";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(5U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::html_tag_open, Toks[1].getKind());
+  ASSERT_EQ(StringRef("tag"),   Toks[1].getHTMLTagOpenName());
+
+  ASSERT_EQ(tok::html_greater,  Toks[2].getKind());  // the closing '>'
+
+  ASSERT_EQ(tok::text,          Toks[3].getKind());  // back to plain text
+  ASSERT_EQ(StringRef(" Meow"), Toks[3].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[4].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML7) {
    const char *Source = "// <tag=";
  
    std::vector<Token> Toks;
@@ -867,7 +934,35 @@ TEST_F(CommentLexerTest, HTML4) {
    ASSERT_EQ(tok::newline,       Toks[3].getKind());
  }
  
-TEST_F(CommentLexerTest, HTML5) {
+TEST_F(CommentLexerTest, HTML8) {  // attribute with '=' but no value
+  const char *Source = "// <tag attr=> Meow";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(7U, Toks.size());
+
+  ASSERT_EQ(tok::text,          Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "),     Toks[0].getText());
+
+  ASSERT_EQ(tok::html_tag_open, Toks[1].getKind());
+  ASSERT_EQ(StringRef("tag"),   Toks[1].getHTMLTagOpenName());
+
+  ASSERT_EQ(tok::html_ident,    Toks[2].getKind());
+  ASSERT_EQ(StringRef("attr"),  Toks[2].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals,   Toks[3].getKind());
+
+  ASSERT_EQ(tok::html_greater,  Toks[4].getKind());  // '>' right after '='
+
+  ASSERT_EQ(tok::text,          Toks[5].getKind());  // back to plain text
+  ASSERT_EQ(StringRef(" Meow"), Toks[5].getText());
+
+  ASSERT_EQ(tok::newline,       Toks[6].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML9) {
    const char *Sources[] = {
      "// <tag attr",
      "// <tag attr "
@@ -893,7 +988,7 @@ TEST_F(CommentLexerTest, HTML5) {
    }
  }
  
-TEST_F(CommentLexerTest, HTML6) {
+TEST_F(CommentLexerTest, HTML10) {
    const char *Sources[] = {
      "// <tag attr=",
      "// <tag attr ="
@@ -921,7 +1016,7 @@ TEST_F(CommentLexerTest, HTML6) {
    }
  }
  
-TEST_F(CommentLexerTest, HTML7) {
+TEST_F(CommentLexerTest, HTML11) {
    const char *Sources[] = {
      "// <tag attr=\"",
      "// <tag attr = \"",
@@ -954,7 +1049,7 @@ TEST_F(CommentLexerTest, HTML7) {
    }
  }
  
-TEST_F(CommentLexerTest, HTML8) {
+TEST_F(CommentLexerTest, HTML12) {
    const char *Source = "// <tag attr=@";
  
    std::vector<Token> Toks;
@@ -980,7 +1075,7 @@ TEST_F(CommentLexerTest, HTML8) {
    ASSERT_EQ(tok::newline,       Toks[5].getKind());
  }
  
-TEST_F(CommentLexerTest, HTML9) {
+TEST_F(CommentLexerTest, HTML13) {
    const char *Sources[] = {
      "// <tag attr=\"val\\\"\\'val",
      "// <tag attr=\"val\\\"\\'val\"",
@@ -1013,7 +1108,7 @@ TEST_F(CommentLexerTest, HTML9) {
    }
  }
  
-TEST_F(CommentLexerTest, HTML10) {
+TEST_F(CommentLexerTest, HTML14) {
    const char *Sources[] = {
      "// <tag attr=\"val\\\"\\'val\">",
      "// <tag attr=\'val\\\"\\'val\'>"
@@ -1046,7 +1141,7 @@ TEST_F(CommentLexerTest, HTML10) {
    }
  }
  
-TEST_F(CommentLexerTest, HTML11) {
+TEST_F(CommentLexerTest, HTML15) {
    const char *Source = "// </";
  
    std::vector<Token> Toks;
@@ -1065,7 +1160,7 @@ TEST_F(CommentLexerTest, HTML11) {
  }
  
  
-TEST_F(CommentLexerTest, HTML12) {
+TEST_F(CommentLexerTest, HTML16) {
    const char *Source = "// </@";
  
    std::vector<Token> Toks;
@@ -1086,7 +1181,7 @@ TEST_F(CommentLexerTest, HTML12) {
    ASSERT_EQ(tok::newline,        Toks[3].getKind());
  }
  
-TEST_F(CommentLexerTest, HTML13) {
+TEST_F(CommentLexerTest, HTML17) {
    const char *Source = "// </tag";
  
    std::vector<Token> Toks;
@@ -1104,7 +1199,7 @@ TEST_F(CommentLexerTest, HTML13) {
    ASSERT_EQ(tok::newline,        Toks[2].getKind());
  }
  
-TEST_F(CommentLexerTest, HTML14) {
+TEST_F(CommentLexerTest, HTML18) {
    const char *Sources[] = {
      "// </tag>",
      "// </ tag>",
author	Dmitri Gribenko <gribozavr@gmail.com>
	Mon, 9 Jul 2012 21:32:40 +0000 (21:32 +0000)
committer	Dmitri Gribenko <gribozavr@gmail.com>
	Mon, 9 Jul 2012 21:32:40 +0000 (21:32 +0000)
lib/AST/CommentLexer.cpp		patch | blob | history
unittests/AST/CommentLexer.cpp		patch | blob | history