From: Jordan Rose
Date: Thu, 24 Jan 2013 20:50:50 +0000 (+0000)
Subject: As an extension, treat Unicode whitespace characters as whitespace.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fc12060ed595fd23d731b8a86adb21ddbb8c7bfb;p=clang

As an extension, treat Unicode whitespace characters as whitespace.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173370 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index e6ffca9554..2a57e6fced 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -2791,7 +2791,36 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
   return CodePoint;
 }
 
+/// Returns true if \p C is one of the Unicode code points that the lexer
+/// treats as whitespace (as an extension): U+0085, U+00A0, U+1680, U+180E,
+/// U+2000 through U+200A, U+2028, U+2029, U+202F, U+205F and U+3000.
+static bool isUnicodeWhitespace(uint32_t C) {
+  return (C == 0x0085 || C == 0x00A0 || C == 0x1680 ||
+          C == 0x180E || (C >= 0x2000 && C <= 0x200A) ||
+          C == 0x2028 || C == 0x2029 || C == 0x202F ||
+          C == 0x205F || C == 0x3000);
+}
+
 void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
+  // Extension: a Unicode whitespace character is diagnosed (outside raw
+  // mode) and then skipped like ordinary whitespace.
+  if (isUnicodeWhitespace(C)) {
+    if (!isLexingRawMode()) {
+      CharSourceRange CharRange =
+        CharSourceRange::getCharRange(getSourceLocation(),
+                                      getSourceLocation(CurPtr));
+      Diag(BufferPtr, diag::ext_unicode_whitespace)
+        << CharRange;
+    }
+
+    // Flag the token being lexed as preceded by whitespace.
+    Result.setFlag(Token::LeadingSpace);
+    if (SkipWhitespace(Result, CurPtr))
+      return; // KeepWhitespaceMode
+
+    return LexTokenInternal(Result);
+  }
+
   if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) {
     MIOpt.ReadToken();
     return LexIdentifier(Result, CurPtr);
diff --git a/test/Lexer/unicode.c b/test/Lexer/unicode.c
new file mode 100644
index 0000000000..1d7b53e2c5
--- /dev/null
+++ b/test/Lexer/unicode.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// This file contains Unicode characters; please do not "fix" them!
+
+extern int x; // expected-warning {{treating Unicode character as whitespace}}
+extern int x; // expected-warning {{treating Unicode character as whitespace}}