From: Shane Carr Date: Thu, 13 Dec 2018 07:55:13 +0000 (-0800) Subject: ICU-10923 Python logic for resource filters. X-Git-Tag: release-64-rc~191 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b0d572c7f1766cb2cf452f5a83b94de67806e393;p=icu ICU-10923 Python logic for resource filters. - Adds schema validation for config file - Adds JSON comment stripping utility --- diff --git a/icu4c/source/data/BUILDRULES.py b/icu4c/source/data/BUILDRULES.py index a3334d5d72e..f305bd589a7 100644 --- a/icu4c/source/data/BUILDRULES.py +++ b/icu4c/source/data/BUILDRULES.py @@ -21,31 +21,6 @@ def generate(config, glob, common_vars): print("Error: Cannot find data directory; please specify --glob_dir", file=sys.stderr) exit(1) - # DIRECTORIES - build_dirs = [ - "{OUT_DIR}", - "{OUT_DIR}/curr", - "{OUT_DIR}/lang", - "{OUT_DIR}/region", - "{OUT_DIR}/zone", - "{OUT_DIR}/unit", - "{OUT_DIR}/brkitr", - "{OUT_DIR}/coll", - "{OUT_DIR}/rbnf", - "{OUT_DIR}/translit", - "{TMP_DIR}", - "{TMP_DIR}/curr", - "{TMP_DIR}/lang", - "{TMP_DIR}/locales", - "{TMP_DIR}/region", - "{TMP_DIR}/zone", - "{TMP_DIR}/unit", - "{TMP_DIR}/coll", - "{TMP_DIR}/rbnf", - "{TMP_DIR}/translit", - "{TMP_DIR}/brkitr" - ] - requests += generate_cnvalias(config, glob, common_vars) requests += generate_confusables(config, glob, common_vars) requests += generate_conversion_mappings(config, glob, common_vars) @@ -154,7 +129,7 @@ def generate(config, glob, common_vars): ) ] - return (build_dirs, requests) + return requests def generate_cnvalias(config, glob, common_vars): diff --git a/icu4c/source/data/buildtool/__main__.py b/icu4c/source/data/buildtool/__main__.py index 23e95723251..7ef4d981bc9 100644 --- a/icu4c/source/data/buildtool/__main__.py +++ b/icu4c/source/data/buildtool/__main__.py @@ -8,9 +8,11 @@ from __future__ import print_function import argparse import glob as pyglob import json +import os import sys from . 
def _parse_filter_file(self, f):
    """Parse the data filter file and store it in ``self.filters_json_data``.

    The file is parsed with Hjson when that package is importable;
    otherwise it is parsed as plain JSON after "//" comment lines have
    been removed by CommentStripper.

    When the optional ``jsonschema`` package is importable, the parsed
    data is additionally checked against ``filtration_schema.json`` (read
    from the directory of this module). Schema violations are reported as
    warnings on stderr; they do not abort the build.

    Args:
        f: An open, readable text stream for the filter file.
    """
    # Use the Hjson parser if it is available; otherwise, use vanilla JSON.
    # Only the import itself is guarded, so that an ImportError raised
    # while actually parsing is not mistaken for "hjson is not installed".
    try:
        import hjson
    except ImportError:
        hjson = None
    if hjson is not None:
        self.filters_json_data = hjson.load(f)
    else:
        self.filters_json_data = json.load(CommentStripper(f))

    # Optionally pre-validate the JSON schema before further processing.
    # Some schema errors will be caught later, but this step ensures
    # maximal validity.
    try:
        import jsonschema
    except ImportError:
        return

    schema_path = os.path.join(os.path.dirname(__file__), "filtration_schema.json")
    with open(schema_path) as schema_f:
        schema = json.load(CommentStripper(schema_f))

    # The validator is already bound to the schema; passing the schema to
    # iter_errors() a second time is deprecated by jsonschema.
    validator = jsonschema.Draft4Validator(schema)
    for error in validator.iter_errors(self.filters_json_data):
        # Render the location as e.g. ".resourceFilters[0].rules[2]".
        location = "".join(
            "[%d]" % part if isinstance(part, int) else ".%s" % part
            for part in error.absolute_path
        )
        print("WARNING: ICU data filter JSON file:", error.message,
            "at", location,
            file=sys.stderr)
class CommentStripper(object):
    """Removes lines starting with "//" from a text file stream.

    The object wraps a readable text stream and exposes a ``read`` method,
    which is all that ``json.load`` needs. A line is dropped, together with
    its terminating newline, only when its first two characters are "//";
    a "//" later in a line is passed through untouched.

    The parser state is kept between calls, so a comment that straddles
    two underlying reads is still removed correctly.
    """

    # Parser states, stored in ``self.state`` between read() calls.
    _LINE_START = 0   # at the first character of a line
    _ONE_SLASH = 1    # saw one "/" at the start of a line; it is held back
    _IN_LINE = 2      # inside an ordinary line; characters pass through
    _IN_COMMENT = 3   # inside a "//" line; characters are dropped

    def __init__(self, f):
        """Wrap *f*, a text stream providing ``read(size)``."""
        self.f = f
        self.state = self._LINE_START
        # Stripped text that has been produced but not yet returned.
        self._buffer = ""

    def read(self, size=-1):
        """Read and return up to *size* characters of comment-free text.

        With a negative or ``None`` *size*, the rest of the stream is
        returned. With a positive *size*, the underlying stream is read
        repeatedly until *size* characters of stripped text are available
        or the stream is exhausted, so an empty string is returned only at
        end of stream and never more than *size* characters are returned.
        """
        if size is None or size < 0:
            result = self._buffer + "".join(self._strip_comments(self.f.read()))
            self._buffer = ""
            # Everything has been consumed, so a held-back "/" is real data.
            return result + self._flush_pending_slash()

        if size == 0:
            return ""

        while len(self._buffer) < size:
            chunk = self.f.read(size)
            if not chunk:
                # End of stream: emit a "/" that was waiting for a successor.
                self._buffer += self._flush_pending_slash()
                break
            self._buffer += "".join(self._strip_comments(chunk))

        result = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return result

    def _flush_pending_slash(self):
        """Return the held-back "/" (if any) once no more input can follow."""
        if self.state != self._ONE_SLASH:
            return ""
        self.state = self._IN_LINE
        return "/"

    def _strip_comments(self, text):
        """Yield the characters of *text* that are not part of a "//" line."""
        for char in text:
            if self.state == self._LINE_START:
                if char == "/":
                    # Might be the start of a comment; decide on the next char.
                    self.state = self._ONE_SLASH
                elif char == "\n":
                    yield char
                else:
                    self.state = self._IN_LINE
                    yield char
            elif self.state == self._ONE_SLASH:
                if char == "/":
                    self.state = self._IN_COMMENT
                elif char == "\n":
                    self.state = self._LINE_START
                    yield "/"  # the one that was held back
                    yield "\n"
                else:
                    self.state = self._IN_LINE
                    yield "/"  # the one that was held back
                    yield char
            elif self.state == self._IN_LINE:
                if char == "\n":
                    self.state = self._LINE_START
                yield char
            elif self.state == self._IN_COMMENT:
                # The newline that ends a comment line is dropped as well.
                if char == "\n":
                    self.state = self._LINE_START
+ assert self.input_files is None + for request in all_requests: + if request.category != self.category: + continue + if not isinstance(request, AbstractExecutionRequest): + continue + if request.tool != IcuTool("genrb"): + continue + self._set_files(request.input_files) + # Add dependencies directly to dep_files + request.dep_files += self.filter_files + arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir + request.args = "%s %s" % (arg_str, request.args) + + # Make sure we found the target request + if self.input_files is None: + print("WARNING: Category not found: %s" % self.category, file=sys.stderr) + self.input_files = [] + self.filter_files = [] + self.rules_by_file = [] + + def _set_files(self, files): + # Note: The input files to genrb for a certain category should always + # be the same. For example, there are often two genrb calls: one for + # --writePoolBundle, and the other for --usePoolBundle. They are both + # expected to have the same list of input files. + if self.input_files is not None: + assert self.input_files == files + return + self.input_files = list(files) + self.filter_files = [ + TmpFile("%s/%s" % (self.filter_tmp_dir, basename)) + for basename in ( + file.filename[file.filename.rfind("/")+1:] + for file in files + ) + ] + self.rules_by_file = [[] for _ in range(len(files))] + + def add_rules(self, file_filter, rules): + for file, rule_list in zip(self.input_files, self.rules_by_file): + if file_filter.match(file): + rule_list += rules + + def make_requests(self): + # Map from rule list to filter files with that rule list + unique_rules = defaultdict(list) + for filter_file, rules in zip(self.filter_files, self.rules_by_file): + unique_rules[tuple(rules)].append(filter_file) + + new_requests = [] + i = 0 + for rules, filter_files in unique_rules.items(): + base_filter_file = filter_files[0] + new_requests += [ + PrintFileRequest( + name = "%s_print_%d" % (self.category, i), + output_file = base_filter_file, + content = 
self._generate_resource_filter_txt(rules) + ) + ] + i += 1 + for filter_file in filter_files[1:]: + new_requests += [ + CopyRequest( + name = "%s_copy_%d" % (self.category, i), + input_file = base_filter_file, + output_file = filter_file + ) + ] + i += 1 + return new_requests + + @classmethod + def _generate_resource_filter_txt(cls, rules): + result = "# Caution: This file is automatically generated\n\n" + result += "\n".join(rules) + return result + + +def _apply_resource_filters(all_requests, config): """Creates filters for looking within resource bundle files.""" - return old_requests + json_data = config.filters_json_data + if "resourceFilters" not in json_data: + return all_requests + + collected = {} + for entry in json_data["resourceFilters"]: + if "files" in entry: + file_filter = Filter.create_from_json(entry["files"]) + else: + file_filter = InclusionFilter() + for category in entry["categories"]: + # not defaultdict because we need to pass arguments to the constructor + if category not in collected: + filter_info = ResourceFilterInfo(category) + filter_info.apply_to_requests(all_requests) + collected[category] = filter_info + else: + filter_info = collected[category] + filter_info.add_rules(file_filter, entry["rules"]) + + # Add the filter generation requests to the beginning so that by default + # they are made before genrb gets run (order is required by windirect) + new_requests = [] + for filter_info in collected.values(): + new_requests += filter_info.make_requests() + new_requests += all_requests + return new_requests diff --git a/icu4c/source/data/buildtool/filtration_schema.json b/icu4c/source/data/buildtool/filtration_schema.json new file mode 100644 index 00000000000..619ae2afc4b --- /dev/null +++ b/icu4c/source/data/buildtool/filtration_schema.json @@ -0,0 +1,85 @@ +// Copyright (C) 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +{ + "$id": "http://unicode.org/icu-filter-schema", + "$schema": "http://json-schema.org/draft-04/schema#", + "description": "JSON Schema for an ICU data filter file", + "type": "object", + "properties": { + "localeFilter": { "$ref": "#/definitions/filter" }, + "featureFilters": { + "type": "object", + "additionalProperties": { "$ref": "#/definitions/filter" } + }, + "resourceFilters": { + "type": "array", + "items": { + "type": "object", + "properties": { + "categories": { + "type": "array", + "items": { "type": "string" } + }, + "files": { "$ref": "#/definitions/filter" }, + "rules": { + "type": "array", + "items": { + "type": "string", + "pattern": "^[+-]/(\\w+(/\\w+)*)?$" + } + } + }, + "required": ["categories", "rules"], + "additionalProperties": false + } + } + }, + "additionalProperties": false, + "definitions": { + "filter": { + "type": "object", + "oneOf": [ + { + "properties": { + "filterType": { "$ref": "#/definitions/filterType" }, + "whitelist": { "$ref": "#/definitions/stringList" } + }, + "required": ["whitelist"], + "additionalProperties": false + }, + { + "properties": { + "filterType": { "$ref": "#/definitions/filterType" }, + "blacklist": { "$ref": "#/definitions/stringList" } + }, + "required": ["blacklist"], + "additionalProperties": false + }, + { + "properties": { + "filterType": { "$ref": "#/definitions/filterType" } + }, + "additionalProperties": false + } + ] + }, + "filterType": { + "type": "string", + "enum": [ + "file-stem", + "language", + "regex", + "exclude" + ] + }, + "stringList": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": true + } + } +} diff --git a/icu4c/source/data/buildtool/request_types.py b/icu4c/source/data/buildtool/request_types.py index 722d432232e..1890dd3c4ac 100644 --- a/icu4c/source/data/buildtool/request_types.py +++ b/icu4c/source/data/buildtool/request_types.py @@ -9,7 +9,7 @@ from abc import 
def compute_directories(requests):
    """Return the sorted, de-duplicated directories of all output files.

    Every output file of *requests*, temporary files included, contributes
    the directory part of ``"<dir_for(file)>/<file.filename>"``.
    """
    dirs = set()
    for out_file in get_all_output_files(requests, include_tmp=True):
        path = "%s/%s" % (dir_for(out_file), out_file.filename)
        # The formatted path always contains a "/", so the text before the
        # last one is the containing directory.
        dirs.add(path.rpartition("/")[0])
    # sorted() already returns a new list.
    return sorted(dirs)