granicus.if.org Git - icu/commitdiff
ICU-21248 Adds source file check (UTF-8 and absence of BOM) to
author gnrunge <nrunge@google.com>
Thu, 17 Sep 2020 21:55:00 +0000 (14:55 -0700)
committer Norbert Runge <41129501+gnrunge@users.noreply.github.com>
Fri, 18 Sep 2020 20:17:50 +0000 (13:17 -0700)
Travis Continuous Integration.

ICU-21248 Factors in review comments.

ICU-21248 Changes: no more filtering of markdown files, moved the
script to icu/tools/script/ directory, removed BOM from one
README.md file.

ICU-21248 Adjusts path to the icu-file-utf8-check script.

ICU-21248 Extends coverage of UTF-8/BOM check to all of icu/ directory.

.travis.yml
icu4c/source/test/testdata/break_rules/README.md
tools/scripts/icu-file-utf8-check.py [moved from icu4c/source/tools/icu-file-utf8-check.py with 69% similarity]

index 9fe2453361573f30215c277f68ff4bce5723baf0..ca33b5b89cfdb3d777278963362fe153081a9b8d 100644 (file)
@@ -195,3 +195,9 @@ matrix:
         - cd icu4c/source
       script:
         - test/hdrtst/testinternalheaders.sh
+
+    # Check source files for valid UTF-8 and for absence of BOM.
+    - name: "UTF-8 and BOM check"
+      os:   linux
+      script:
+        - tools/scripts/icu-file-utf8-check.py
index d2501c3cf73cca44c95b6d1195ea6f8794036ceb..1deb4dfc32f09868fe057db9d59e3feecfc02350 100644 (file)
@@ -1,4 +1,4 @@
-<!--
+<!--
 Copyright (C) 2016 and later: Unicode, Inc. and others.
 License & terms of use: http://www.unicode.org/copyright.html
 
similarity index 69%
rename from icu4c/source/tools/icu-file-utf8-check.py
rename to tools/scripts/icu-file-utf8-check.py
index 86de259e8832216369500e9578d6a7e596c30ef7..9e30e3b48664bad13512e05f605bdc962d9a49d2 100755 (executable)
@@ -22,6 +22,8 @@
 #  The tool operates recursively on the directory from which it is run.
 #  Only files from the ICU github repository are checked.
 #  No changes are made to the repository; only the working copy will be altered.
+#  The script checks all source files and returns a non-zero exit code if any of
+#  the checked files contain non-UTF-8 bytes or an unexpected UTF-8 BOM.
 
 from __future__ import print_function
 
@@ -32,6 +34,10 @@ import re
 import getopt
 
 
+# List of directories to check for UTF-8 and BOM. Currently covers
+# all of icu/. Modify as needed.
+icu_directories_to_be_scanned = ["."]
+
 def runCommand(cmd):
     output_file = os.popen(cmd);
     output_text = output_file.read();
@@ -45,13 +51,14 @@ def runCommand(cmd):
 def usage():
     print("usage: " + sys.argv[0] + " [-h | --help]")
 
-    
+
 #
 #  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
 #    file_name:        name of a text file.
 #    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
 #
 def check_file(file_name, is_source):
+    rc = 0
     f = open(file_name, 'rb')
     bytes = f.read()
     f.close()
@@ -61,16 +68,19 @@ def check_file(file_name, is_source):
             bytes.decode("UTF-8")
         except UnicodeDecodeError:
             print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
-    
+            rc = 1
+
     if bytes[0] == 0xef:
         if not (file_name.endswith(".txt") or file_name.endswith(".sln")
-                    or file_name.endswith(".targets")
-                    or ".vcxproj" in file_name):
+                    or file_name.endswith(".targets") or ".vcxproj" in file_name):
             print("Warning: file %s contains a UTF-8 BOM: " % file_name)
+            rc = 1
 
-    return
+    return rc
 
 def main(argv):
+    exit_status = 0
+
     try:
         opts, args = getopt.getopt(argv, "h", ("help"))
     except getopt.GetoptError:
@@ -84,23 +94,30 @@ def main(argv):
     if args:
         print("unexpected command line argument")
         usage()
-        sys.exit()
-
-    output = runCommand("git ls-files ");
-    file_list = output.splitlines()
+        sys.exit(2)
 
     source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))")
-    
-    for f in file_list:
-        if os.path.isdir(f):
-            print("Skipping dir " + f)
-            continue
-        if not os.path.isfile(f):
-            print("Repository file not in working copy: " + f)
-            continue;
-
-        source_file = source_file_re.match(f)
-        check_file(f, source_file)
+    git_cmd = "git ls-files DIR"
+
+    for dir in icu_directories_to_be_scanned:
+        print('Scanning ' + dir)
+        output = runCommand(git_cmd.replace("DIR", dir))
+        file_list = output.splitlines()
+
+        for f in file_list:
+            if os.path.isdir(f):
+                print("Skipping dir " + f)
+                continue
+            if not os.path.isfile(f):
+                print("Repository file not in working copy: " + f)
+                continue;
+
+            source_file = source_file_re.match(f)
+            if check_file(f, source_file) != 0:
+                exit_status = 1
+
+    print(exit_status)
+    sys.exit(exit_status)
 
 if __name__ == "__main__":
     main(sys.argv[1:])