From 6f057afad64b667bf1279e180b8d5334aa9bc858 Mon Sep 17 00:00:00 2001 From: Norbert Runge Date: Mon, 17 Sep 2018 16:27:49 -0700 Subject: [PATCH] ICU-20119 Migrates script that checks for proper UTF-8 encoding in source files and for absence of BOM in all but text files into the github environment. --- icu4c/source/tools/icu-svnprops-check.py | 128 +++++------------------ 1 file changed, 25 insertions(+), 103 deletions(-) diff --git a/icu4c/source/tools/icu-svnprops-check.py b/icu4c/source/tools/icu-svnprops-check.py index 313da136884..5576cd57368 100755 --- a/icu4c/source/tools/icu-svnprops-check.py +++ b/icu4c/source/tools/icu-svnprops-check.py @@ -7,8 +7,7 @@ # All rights reserved. # -# Script to check and fix svn property settings for ICU source files. -# Also check for the correct line endings on files with svn:eol-style = native +# Script to check ICU source files to contain only UTF-8 encoded characters. # # THIS SCRIPT DOES NOT WORK ON WINDOWS # It only works correctly on platforms where the native line ending is a plain \n @@ -17,11 +16,10 @@ # icu-svnprops-check.py [options] # # options: -# -f | --fix Fix any problems that are found # -h | --help Print a usage line and exit. # # The tool operates recursively on the directory from which it is run. -# Only files from the svn repository are checked. +# Only files from the ICU github repository are checked. # No changes are made to the repository; only the working copy will be altered. import sys @@ -31,61 +29,6 @@ import re import getopt -# file_types: The parsed form of the svn auto-props specification. -# A list of file types - .cc, .cpp, .txt, etc. -# each element is a [type, proplist] -# "type" is a regular expression string that will match a file name -# prop list is another list, one element per property. -# Each property item is a two element list, [prop name, prop value] -file_types = list() - -def parse_auto_props(): - aprops = svn_auto_props.splitlines() - for propline in aprops: - if re.match("\s*(#.*)?$", propline): # Match comment and blank lines - continue - if re.match("\s*\[auto-props\]", propline): # Match the [auto-props] line. - continue - if not re.match("\s*[^\s]+\s*=", propline): # minimal syntax check for = - print "Bad line from autoprops definitions: " + propline - continue - file_type, string_proplist = propline.split("=", 1) - - #transform the file type expression from autoprops into a normal regular expression. - # e.g. "*.cpp" ==> ".*\.cpp$" - file_type = file_type.strip() - file_type = file_type.replace(".", "\.") - file_type = file_type.replace("*", ".*") - file_type = file_type + "$" - - # example string_proplist at this point: " svn:eol-style=native;svn:executable" - # split on ';' into a list of properties. The negative lookahead and lookbehind - # in the split regexp are to prevent matching on ';;', which is an escaped ';' - # within a property value. - string_proplist = re.split("(?= 0: - prop_name, prop_val = prop.split("=", 1) - else: - # properties with no explicit value, e.g. svn:executable - prop_name, prop_val = prop, "" - prop_name = prop_name.strip() - prop_val = prop_val.strip() - # unescape any ";;" in a property value, e.g. the mime-type from - # *.java = svn:eol-style=native;svn:mime-type=text/plain;;charset=utf-8 - prop_val = prop_val.replace(";;", ";"); - # If the prop value "is quoted", remove the quotes. - # See svn:keywords for an example of a quoted prop value. - match = re.match('^"(.+)"$', prop_val) - if match: - prop_val = match.group(1) - proplist.append((prop_name, prop_val)) - - file_types.append((file_type, proplist)) - # print file_types - - def runCommand(cmd): output_file = os.popen(cmd); output_text = output_file.read(); @@ -95,49 +38,32 @@ def runCommand(cmd): sys.exit(exit_status) return output_text -svn_auto_props = runCommand("svn propget svn:auto-props http://source.icu-project.org/repos/icu") def usage(): print "usage: " + sys.argv[0] + " [-f | --fix] [-h | --help]" # -# UTF-8 file check. For text files with svn:mime-type=text/anything, check the specified charset +# File check. Check source code files for UTF-8 and all except text files for not containing a BOM # file_name: name of a text file. -# base_mime_type: svn:mime-type property from the auto-props settings for this file type. -# actual_mime_type: existing svn:mime-type property value for the file. -# return: The correct svn:mime-type property value, -# either the original, if it looks OK, otherwise the value from auto-props +# is_source: Flag, set to True if file is a souece code file (.c, .cpp, .h, .java). # -def check_utf8(file_name, base_mime_type, actual_mime_type): - +def check_file(file_name, is_source): f = open(file_name, 'r') bytes = f.read() f.close() - file_is_utf8 = True - try: - bytes.decode("UTF-8") - except UnicodeDecodeError: - file_is_utf8 = False - - if not file_is_utf8 and actual_mime_type.find("utf-8") >= 0: - print "Error: %s is not valid utf-8, but has a utf-8 mime type." % file_name - return actual_mime_type - - if file_is_utf8 and actual_mime_type.find("charset") >=0 and actual_mime_type.find("utf-8") < 0: - print "Warning: %s is valid utf-8, but has a mime-type of %s." % (file_name, actual_mime_type) + if is_source: + try: + bytes.decode("UTF-8") + except UnicodeDecodeError: + print "Error: %s is a source code file but contains non-utf-8 bytes." % file_name + if ord(bytes[0]) == 0xef: if not file_name.endswith(".txt"): print "Warning: file %s contains a UTF-8 BOM: " % file_name - # If the file already has a charset in its mime-type, don't make any change. - - if actual_mime_type.find("charset=") >= 0: - return actual_mime_type; - - return base_mime_type - + return def main(argv): fix_problems = False; @@ -158,32 +84,28 @@ def main(argv): usage() sys.exit() - parse_auto_props() - output = runCommand("svn ls -R "); + output = runCommand("git ls-files "); file_list = output.splitlines() + source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))") + for f in file_list: + print "Processing file " + f if os.path.isdir(f): - # print "Skipping dir " + f + print "Skipping dir " + f continue if not os.path.isfile(f): print "Repository file not in working copy: " + f continue; - for file_pattern, props in file_types: - if re.match(file_pattern, f): - # print "doing " + f - for propname, propval in props: - actual_propval = runCommand("svn propget --strict " + propname + " " + f) - #print propname + ": " + actual_propval - if propname == "svn:mime-type" and propval.find("text/") == 0: - # check for UTF-8 text files, should have svn:mime-type=text/something; charset=utf8 - propval = check_utf8(f, propval, actual_propval) - if not (propval == actual_propval or (propval == "" and actual_propval == "*")): - print "svn propset %s '%s' %s" % (propname, propval, f) - if fix_problems: - os.system("svn propset %s '%s' %s" % (propname, propval, f)) - + if source_file_re.match(f): + print "Checking source file: " + f + source_file = True + check_file(f, source_file); + else: + print "Checking non-source file: " + f + source_file = False + check_file(f, source_file) if __name__ == "__main__": main(sys.argv[1:]) -- 2.40.0