From d61a5cefe6aa7f71d748299d7a4b3cfe14a70cea Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Fri, 7 Sep 2018 19:25:39 +0000 Subject: [PATCH] PR38870: Add warning for zero-width unicode characters appearing in identifiers. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@341700 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/DiagnosticLexKinds.td | 3 +++ lib/Lex/Lexer.cpp | 21 ++++++++++++++++++--- test/Lexer/unicode.c | 7 +++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index 1c960711bc..8cf6d7e7c0 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -122,6 +122,9 @@ def ext_unicode_whitespace : ExtWarn< def warn_utf8_symbol_homoglyph : Warning< "treating Unicode character <U+%0> as identifier character rather than " "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>; +def warn_utf8_symbol_zero_width : Warning< + "identifier contains Unicode character <U+%0> that is invisible in " + "some environments">, InGroup<DiagGroup<"unicode-zero-width">>; def err_hex_escape_no_digits : Error< "\\%0 used with no following hex digits">; diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index e8588a771a..6a69bb4974 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -1510,8 +1510,17 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, bool operator<(HomoglyphPair R) const { return Character < R.Character; } }; static constexpr HomoglyphPair SortedHomoglyphs[] = { + {U'\u00ad', 0}, // SOFT HYPHEN {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK {U'\u037e', ';'}, // GREEK QUESTION MARK + {U'\u200b', 0}, // ZERO WIDTH SPACE + {U'\u200c', 0}, // ZERO WIDTH NON-JOINER + {U'\u200d', 0}, // ZERO WIDTH JOINER + {U'\u2060', 0}, // WORD JOINER + {U'\u2061', 0}, // FUNCTION APPLICATION + {U'\u2062', 0}, // INVISIBLE TIMES + {U'\u2063', 0}, // INVISIBLE SEPARATOR + {U'\u2064', 0}, // INVISIBLE PLUS {U'\u2212', '-'}, // MINUS SIGN {U'\u2215', 
'/'}, // DIVISION SLASH {U'\u2216', '\\'}, // SET MINUS @@ -1521,6 +1530,7 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, {U'\u2236', ':'}, // RATIO {U'\u223c', '~'}, // TILDE OPERATOR {U'\ua789', ':'}, // MODIFIER LETTER COLON + {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN @@ -1560,9 +1570,14 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, llvm::raw_svector_ostream CharOS(CharBuf); llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); } - const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; - Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) - << Range << CharBuf << LooksLikeStr; + if (Homoglyph->LooksLike) { + const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) + << Range << CharBuf << LooksLikeStr; + } else { + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) + << Range << CharBuf; + } } } diff --git a/test/Lexer/unicode.c b/test/Lexer/unicode.c index 30e353fa79..bebab82988 100644 --- a/test/Lexer/unicode.c +++ b/test/Lexer/unicode.c @@ -38,3 +38,10 @@ int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identi int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}} // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}} int v=[=](auto){return~x;}(); // expected-warning 12{{treating Unicode character}} + +int ⁠x﻿x‍; +// expected-warning@-1 {{identifier contains Unicode character <U+2060> that is invisible in some environments}} +// expected-warning@-2 {{identifier contains Unicode character <U+FEFF> that is invisible in some environments}} +// expected-warning@-3 {{identifier contains Unicode character <U+200D> that is invisible in some environments}} +int foo​bar = 
0; // expected-warning {{identifier contains Unicode character <U+200B> that is invisible in some environments}}
int x = foobar; // expected-error {{undeclared identifier}} -- 2.40.0