granicus.if.org Git - clang/commitdiff
PR38870: Add warning for zero-width unicode characters appearing in
author: Richard Smith <richard-llvm@metafoo.co.uk>
Fri, 7 Sep 2018 19:25:39 +0000 (19:25 +0000)
committer: Richard Smith <richard-llvm@metafoo.co.uk>
Fri, 7 Sep 2018 19:25:39 +0000 (19:25 +0000)
identifiers.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@341700 91177308-0d34-0410-b5e6-96231b3b80d8

include/clang/Basic/DiagnosticLexKinds.td
lib/Lex/Lexer.cpp
test/Lexer/unicode.c

index 1c960711bccb6028aa7ec4cc5c8fc4c241297e5f..8cf6d7e7c09593e2f019e6c299952a623a46eec7 100644 (file)
@@ -122,6 +122,9 @@ def ext_unicode_whitespace : ExtWarn<
 def warn_utf8_symbol_homoglyph : Warning<
   "treating Unicode character <U+%0> as identifier character rather than "
   "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
+def warn_utf8_symbol_zero_width : Warning<
+  "identifier contains Unicode character <U+%0> that is invisible in "
+  "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
 
 def err_hex_escape_no_digits : Error<
   "\\%0 used with no following hex digits">;
index e8588a771a431186b09370294e4db54dd91d8c83..6a69bb4974af1e5d43c74c9706117a4d50267cca 100644 (file)
@@ -1510,8 +1510,17 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
     bool operator<(HomoglyphPair R) const { return Character < R.Character; }
   };
   static constexpr HomoglyphPair SortedHomoglyphs[] = {
+    {U'\u00ad', 0},   // SOFT HYPHEN
     {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
     {U'\u037e', ';'}, // GREEK QUESTION MARK
+    {U'\u200b', 0},   // ZERO WIDTH SPACE
+    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
+    {U'\u200d', 0},   // ZERO WIDTH JOINER
+    {U'\u2060', 0},   // WORD JOINER
+    {U'\u2061', 0},   // FUNCTION APPLICATION
+    {U'\u2062', 0},   // INVISIBLE TIMES
+    {U'\u2063', 0},   // INVISIBLE SEPARATOR
+    {U'\u2064', 0},   // INVISIBLE PLUS
     {U'\u2212', '-'}, // MINUS SIGN
     {U'\u2215', '/'}, // DIVISION SLASH
     {U'\u2216', '\\'}, // SET MINUS
@@ -1521,6 +1530,7 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
     {U'\u2236', ':'}, // RATIO
     {U'\u223c', '~'}, // TILDE OPERATOR
     {U'\ua789', ':'}, // MODIFIER LETTER COLON
+    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
     {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
     {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
     {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
@@ -1560,9 +1570,14 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
       llvm::raw_svector_ostream CharOS(CharBuf);
       llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
     }
-    const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
-    Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
-        << Range << CharBuf << LooksLikeStr;
+    if (Homoglyph->LooksLike) {
+      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+          << Range << CharBuf << LooksLikeStr;
+    } else {
+      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
+          << Range << CharBuf;
+    }
   }
 }
 
index 30e353fa797e0e63fcc8d55a1a2132820aaaa351..bebab829880c196158869aad0a0638d8e785f78c 100644 (file)
@@ -38,3 +38,10 @@ int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identi
 int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
                  // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
 int v=[=](auto){return~x;}(); // expected-warning 12{{treating Unicode character}}
+
+int ⁠xx‍;
+// expected-warning@-1 {{identifier contains Unicode character <U+2060> that is invisible in some environments}}
+// expected-warning@-2 {{identifier contains Unicode character <U+FEFF> that is invisible in some environments}}
+// expected-warning@-3 {{identifier contains Unicode character <U+200D> that is invisible in some environments}}
+int foo​bar = 0; // expected-warning {{identifier contains Unicode character <U+200B> that is invisible in some environments}}
+int x = foobar; // expected-error {{undeclared identifier}}