]> granicus.if.org Git - clang/commitdiff
enhance sourcemgr to detect various UTF BOMs and emit a fatal error
authorChris Lattner <sabre@nondot.org>
Tue, 20 Apr 2010 18:14:03 +0000 (18:14 +0000)
committerChris Lattner <sabre@nondot.org>
Tue, 20 Apr 2010 18:14:03 +0000 (18:14 +0000)
about it instead of producing tons of garbage from the lexer.

It would be even better for sourcemgr to dynamically transcode (e.g.
from UTF16 -> UTF8).

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@101924 91177308-0d34-0410-b5e6-96231b3b80d8

include/clang/Basic/DiagnosticCommonKinds.td
lib/Basic/SourceManager.cpp
test/Lexer/utf-16.c [new file with mode: 0644]
test/Lexer/utf-16.c.txt [new file with mode: 0644]

index 5e28f49adeea16a57b6238e3f740695dd8febba9..88e7dc19aec52d4a7b28c8001db1247daeb0b186 100644 (file)
@@ -72,5 +72,6 @@ def err_target_invalid_feature : Error<"invalid target feature '%0'">;
 def err_cannot_open_file : Error<"cannot open file '%0': %1">, DefaultFatal;
 def err_file_modified : Error<
   "file '%0' modified since it was first processed">, DefaultFatal;
-  
+def err_unsupported_bom : Error<"%0 byte order mark detected in '%1', but "
+  "encoding is not supported">, DefaultFatal;
 }
index 053cfe333d0b5308b477a22c6d896f4a9bfccc55..c76624139a08c4e20fd2d6876920eec18f9a1d49 100644 (file)
@@ -119,6 +119,41 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag,
       Buffer.setInt(true);
 #endif
     }
+    
+    // If the buffer is valid, check to see if it has a UTF Byte Order Mark
+    // (BOM).  We only support UTF-8 without a BOM right now.  See
+    // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
+    if (!Buffer.getInt()) {
+      llvm::StringRef BufStr = Buffer.getPointer()->getBuffer();
+      const char *BOM = 0;
+      if (BufStr.startswith("\xFE\xBB\xBF"))
+        BOM = "UTF-8";
+      else if (BufStr.startswith("\xFE\xFF"))
+        BOM = "UTF-16 (BE)";
+      else if (BufStr.startswith("\xFF\xFE"))
+        BOM = "UTF-16 (LE)";
+      else if (BufStr.startswith(llvm::StringRef("\x00\x00\xFE\xFF", 4)))
+        BOM = "UTF-32 (BE)";
+      else if (BufStr.startswith(llvm::StringRef("\xFF\xFE\x00\x00", 4)))
+        BOM = "UTF-32 (LE)";
+      else if (BufStr.startswith("\x2B\x2F\x76"))
+        BOM = "UTF-7";
+      else if (BufStr.startswith("\xF7\x64\x4C"))
+        BOM = "UTF-1";
+      else if (BufStr.startswith("\xDD\x73\x66\x73"))
+        BOM = "UTF-EBCDIC";
+      else if (BufStr.startswith("\x0E\xFE\xFF"))
+        BOM = "SDSU";
+      else if (BufStr.startswith("\xFB\xEE\x28"))
+        BOM = "BOCU-1";
+      else if (BufStr.startswith("\x84\x31\x95\x33"))
+        BOM = "BOCU-1";
+      
+      if (BOM) {
+        Diag.Report(diag::err_unsupported_bom) << BOM << Entry->getName();
+        Buffer.setInt(1);
+      }
+    }
   }
   
   if (Invalid)
diff --git a/test/Lexer/utf-16.c b/test/Lexer/utf-16.c
new file mode 100644 (file)
index 0000000..7c14e39
--- /dev/null
@@ -0,0 +1,4 @@
+// RUN: not %clang -xc %s.txt -fsyntax-only 2>&1 | grep 'UTF-16 (LE) byte order mark detected'
+// rdar://7876588
+
+// This test verifies that clang gives a decent error for UTF-16 source files.
\ No newline at end of file
diff --git a/test/Lexer/utf-16.c.txt b/test/Lexer/utf-16.c.txt
new file mode 100644 (file)
index 0000000..4f3d169
Binary files /dev/null and b/test/Lexer/utf-16.c.txt differ