From: Chris Lattner Date: Tue, 20 Apr 2010 18:14:03 +0000 (+0000) Subject: enhance sourcemgr to detect various UTF BOM's and emit a fatal error X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=38caec48bc1c9816ca59b8d164a64447ee208c2e;p=clang enhance sourcemgr to detect various UTF BOM's and emit a fatal error about it instead of producing tons of garbage from the lexer. It would be even better for sourcemgr to dynamically transcode (e.g. from UTF16 -> UTF8). git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@101924 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/clang/Basic/DiagnosticCommonKinds.td b/include/clang/Basic/DiagnosticCommonKinds.td index 5e28f49ade..88e7dc19ae 100644 --- a/include/clang/Basic/DiagnosticCommonKinds.td +++ b/include/clang/Basic/DiagnosticCommonKinds.td @@ -72,5 +72,6 @@ def err_target_invalid_feature : Error<"invalid target feature '%0'">; def err_cannot_open_file : Error<"cannot open file '%0': %1">, DefaultFatal; def err_file_modified : Error< "file '%0' modified since it was first processed">, DefaultFatal; - +def err_unsupported_bom : Error<"%0 byte order mark detected in '%1', but " + "encoding is not supported">, DefaultFatal; } diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp index 053cfe333d..c76624139a 100644 --- a/lib/Basic/SourceManager.cpp +++ b/lib/Basic/SourceManager.cpp @@ -119,6 +119,41 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag, Buffer.setInt(true); #endif } + + // If the buffer is valid, check to see if it has a UTF Byte Order Mark + // (BOM). We only support UTF-8 without a BOM right now. See + // http://en.wikipedia.org/wiki/Byte_order_mark for more information. 
+    if (!Buffer.getInt()) {
+      llvm::StringRef BufStr = Buffer.getPointer()->getBuffer();
+      const char *BOM = 0;
+      if (BufStr.startswith("\xEF\xBB\xBF"))
+        BOM = "UTF-8";
+      else if (BufStr.startswith(llvm::StringRef("\x00\x00\xFE\xFF", 4)))
+        BOM = "UTF-32 (BE)";
+      else if (BufStr.startswith(llvm::StringRef("\xFF\xFE\x00\x00", 4)))
+        BOM = "UTF-32 (LE)";  // Tested before UTF-16 (LE): shares FF FE prefix.
+      else if (BufStr.startswith("\xFE\xFF"))
+        BOM = "UTF-16 (BE)";
+      else if (BufStr.startswith("\xFF\xFE"))
+        BOM = "UTF-16 (LE)";
+      else if (BufStr.startswith("\x2B\x2F\x76"))
+        BOM = "UTF-7";
+      else if (BufStr.startswith("\xF7\x64\x4C"))
+        BOM = "UTF-1";
+      else if (BufStr.startswith("\xDD\x73\x66\x73"))
+        BOM = "UTF-EBCDIC";
+      else if (BufStr.startswith("\x0E\xFE\xFF"))
+        BOM = "SCSU";
+      else if (BufStr.startswith("\xFB\xEE\x28"))
+        BOM = "BOCU-1";
+      else if (BufStr.startswith("\x84\x31\x95\x33"))
+        BOM = "GB-18030";
+
+      if (BOM) {
+        Diag.Report(diag::err_unsupported_bom) << BOM << Entry->getName();
+        Buffer.setInt(true);  // Flag the buffer as invalid.
+      }
+    }
   }
 
   if (Invalid)
diff --git a/test/Lexer/utf-16.c b/test/Lexer/utf-16.c
new file mode 100644
index 0000000000..7c14e3932b
--- /dev/null
+++ b/test/Lexer/utf-16.c
@@ -0,0 +1,4 @@
+// RUN: not %clang -xc %s.txt -fsyntax-only 2>&1 | grep 'UTF-16 (LE) byte order mark detected'
+// rdar://7876588
+
+// This test verifies that clang gives a decent error for UTF-16 source files.
\ No newline at end of file
diff --git a/test/Lexer/utf-16.c.txt b/test/Lexer/utf-16.c.txt
new file mode 100644
index 0000000000..4f3d1694bb
Binary files /dev/null and b/test/Lexer/utf-16.c.txt differ