granicus.if.org Git - icu/commitdiff
ICU-10923 Python logic for resource filters.
authorShane Carr <shane@unicode.org>
Thu, 13 Dec 2018 07:55:13 +0000 (23:55 -0800)
committerShane F. Carr <shane@unicode.org>
Tue, 18 Dec 2018 01:11:25 +0000 (17:11 -0800)
- Adds schema validation for config file
- Adds JSON comment stripping utility

icu4c/source/data/BUILDRULES.py
icu4c/source/data/buildtool/__main__.py
icu4c/source/data/buildtool/comment_stripper.py [new file with mode: 0644]
icu4c/source/data/buildtool/filtration.py
icu4c/source/data/buildtool/filtration_schema.json [new file with mode: 0644]
icu4c/source/data/buildtool/request_types.py
icu4c/source/data/buildtool/utils.py
icu4c/source/test/testdata/BUILDRULES.py

index a3334d5d72ed60ca14c4c7dbbdda77368e734c8e..f305bd589a72fcec2194630eca80cdd1b940dfad 100644 (file)
@@ -21,31 +21,6 @@ def generate(config, glob, common_vars):
         print("Error: Cannot find data directory; please specify --glob_dir", file=sys.stderr)
         exit(1)
 
-    # DIRECTORIES
-    build_dirs = [
-        "{OUT_DIR}",
-        "{OUT_DIR}/curr",
-        "{OUT_DIR}/lang",
-        "{OUT_DIR}/region",
-        "{OUT_DIR}/zone",
-        "{OUT_DIR}/unit",
-        "{OUT_DIR}/brkitr",
-        "{OUT_DIR}/coll",
-        "{OUT_DIR}/rbnf",
-        "{OUT_DIR}/translit",
-        "{TMP_DIR}",
-        "{TMP_DIR}/curr",
-        "{TMP_DIR}/lang",
-        "{TMP_DIR}/locales",
-        "{TMP_DIR}/region",
-        "{TMP_DIR}/zone",
-        "{TMP_DIR}/unit",
-        "{TMP_DIR}/coll",
-        "{TMP_DIR}/rbnf",
-        "{TMP_DIR}/translit",
-        "{TMP_DIR}/brkitr"
-    ]
-
     requests += generate_cnvalias(config, glob, common_vars)
     requests += generate_confusables(config, glob, common_vars)
     requests += generate_conversion_mappings(config, glob, common_vars)
@@ -154,7 +129,7 @@ def generate(config, glob, common_vars):
         )
     ]
 
-    return (build_dirs, requests)
+    return requests
 
 
 def generate_cnvalias(config, glob, common_vars):
index 23e9572325183ad6cd768238b06cbf6c6301aa1e..7ef4d981bc977e37a603255d2892b2ed97e53106 100644 (file)
@@ -8,9 +8,11 @@ from __future__ import print_function
 import argparse
 import glob as pyglob
 import json
+import os
 import sys
 
 from . import *
+from .comment_stripper import CommentStripper
 from .renderers import makefile, windirect
 from . import filtration, utils
 import BUILDRULES
@@ -118,15 +120,38 @@ class Config(object):
             try:
                 with open(args.filter_file, "r") as f:
                     print("Note: Applying filters from %s." % args.filter_file, file=sys.stderr)
-                    try:
-                        import hjson
-                        self.filters_json_data = hjson.load(f)
-                    except ImportError:
-                        self.filters_json_data = json.load(f)
+                    self._parse_filter_file(f)
             except IOError:
                 print("Error: Could not read filter file %s." % args.filter_file, file=sys.stderr)
                 exit(1)
 
+    def _parse_filter_file(self, f):
+        # Use the Hjson parser if it is available; otherwise, use vanilla JSON.
+        try:
+            import hjson
+            self.filters_json_data = hjson.load(f)
+        except ImportError:
+            self.filters_json_data = json.load(CommentStripper(f))
+
+        # Optionally pre-validate the JSON schema before further processing.
+        # Some schema errors will be caught later, but this step ensures
+        # maximal validity.
+        try:
+            import jsonschema
+            schema_path = os.path.join(os.path.dirname(__file__), "filtration_schema.json")
+            with open(schema_path) as schema_f:
+                schema = json.load(CommentStripper(schema_f))
+            validator = jsonschema.Draft4Validator(schema)
+            for error in validator.iter_errors(self.filters_json_data, schema):
+                print("WARNING: ICU data filter JSON file:", error.message,
+                    "at", "".join(
+                        "[%d]" % part if isinstance(part, int) else ".%s" % part
+                        for part in error.absolute_path
+                    ),
+                    file=sys.stderr)
+        except ImportError:
+            pass
+
     def has_feature(self, feature_name):
         assert feature_name in AVAILABLE_FEATURES
         return feature_name in self._feature_set
@@ -166,10 +191,12 @@ def main():
         # For the purposes of buildtool, force Unix-style directory separators.
         return [v.replace("\\", "/")[len(args.glob_dir)+1:] for v in sorted(result_paths)]
 
-    build_dirs, requests = BUILDRULES.generate(config, glob, common)
+    requests = BUILDRULES.generate(config, glob, common)
     requests = filtration.apply_filters(requests, config)
     requests = utils.flatten_requests(requests, config, common)
 
+    build_dirs = utils.compute_directories(requests)
+
     if args.format == "gnumake":
         print(makefile.get_gnumake_rules(
             build_dirs,
diff --git a/icu4c/source/data/buildtool/comment_stripper.py b/icu4c/source/data/buildtool/comment_stripper.py
new file mode 100644 (file)
index 0000000..4001f2f
--- /dev/null
@@ -0,0 +1,51 @@
+# Copyright (C) 2018 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+
+import io
+
+class CommentStripper(object):
+    """Removes lines starting with "//" from a file stream."""
+
+    def __init__(self, f):
+        self.f = f
+        self.state = 0
+
+    def read(self, size=-1):
+        bytes = self.f.read(size)
+        # TODO: Do we need to read more bytes if comments were stripped
+        # in order to obey the size request?
+        return "".join(self._strip_comments(bytes))
+
+    def _strip_comments(self, bytes):
+        for byte in bytes:
+            if self.state == 0:
+                # state 0: start of a line
+                if byte == "/":
+                    self.state = 1
+                elif byte == "\n":
+                    self.state = 0
+                    yield byte
+                else:
+                    self.state = 2
+                    yield byte
+            elif self.state == 1:
+                # state 1: read a single '/'
+                if byte == "/":
+                    self.state = 3
+                elif byte == "\n":
+                    self.state = 0
+                    yield "/"  # the one that was skipped
+                    yield "\n"
+                else:
+                    self.state = 2
+                    yield "/"  # the one that was skipped
+                    yield byte
+            elif self.state == 2:
+                # state 2: middle of a line, no comment
+                if byte == "\n":
+                    self.state = 0
+                yield byte
+            elif self.state == 3:
+                # state 3: inside a comment
+                if byte == "\n":
+                    self.state = 0
index e7a4c6380e6a226e3383851577a9f2e3afaa500d..26530581fbf51389e4362f2467c981276f79b3e5 100644 (file)
@@ -50,6 +50,11 @@ class Filter(object):
         pass
 
 
+class InclusionFilter(Filter):
+    def match(self, file):
+        return True
+
+
 class ExclusionFilter(Filter):
     def match(self, file):
         return False
@@ -166,6 +171,122 @@ def _preprocess_file_filters(requests, config):
     return filters
 
 
-def _apply_resource_filters(old_requests, config):
+class ResourceFilterInfo(object):
+    def __init__(self, category):
+        self.category = category
+        self.filter_tmp_dir = "filters/%s" % category
+        self.input_files = None
+        self.filter_files = None
+        self.rules_by_file = None
+
+    def apply_to_requests(self, all_requests):
+        # Call this method only once per list of requests.
+        assert self.input_files is None
+        for request in all_requests:
+            if request.category != self.category:
+                continue
+            if not isinstance(request, AbstractExecutionRequest):
+                continue
+            if request.tool != IcuTool("genrb"):
+                continue
+            self._set_files(request.input_files)
+            # Add dependencies directly to dep_files
+            request.dep_files += self.filter_files
+            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
+            request.args = "%s %s" % (arg_str, request.args)
+
+        # Make sure we found the target request
+        if self.input_files is None:
+            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
+            self.input_files = []
+            self.filter_files = []
+            self.rules_by_file = []
+
+    def _set_files(self, files):
+        # Note: The input files to genrb for a certain category should always
+        # be the same. For example, there are often two genrb calls: one for
+        # --writePoolBundle, and the other for --usePoolBundle. They are both
+        # expected to have the same list of input files.
+        if self.input_files is not None:
+            assert self.input_files == files
+            return
+        self.input_files = list(files)
+        self.filter_files = [
+            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
+            for basename in (
+                file.filename[file.filename.rfind("/")+1:]
+                for file in files
+            )
+        ]
+        self.rules_by_file = [[] for _ in range(len(files))]
+
+    def add_rules(self, file_filter, rules):
+        for file, rule_list in zip(self.input_files, self.rules_by_file):
+            if file_filter.match(file):
+                rule_list += rules
+
+    def make_requests(self):
+        # Map from rule list to filter files with that rule list
+        unique_rules = defaultdict(list)
+        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
+            unique_rules[tuple(rules)].append(filter_file)
+
+        new_requests = []
+        i = 0
+        for rules, filter_files in unique_rules.items():
+            base_filter_file = filter_files[0]
+            new_requests += [
+                PrintFileRequest(
+                    name = "%s_print_%d" % (self.category, i),
+                    output_file = base_filter_file,
+                    content = self._generate_resource_filter_txt(rules)
+                )
+            ]
+            i += 1
+            for filter_file in filter_files[1:]:
+                new_requests += [
+                    CopyRequest(
+                        name = "%s_copy_%d" % (self.category, i),
+                        input_file = base_filter_file,
+                        output_file = filter_file
+                    )
+                ]
+                i += 1
+        return new_requests
+
+    @classmethod
+    def _generate_resource_filter_txt(cls, rules):
+        result = "# Caution: This file is automatically generated\n\n"
+        result += "\n".join(rules)
+        return result
+
+
+def _apply_resource_filters(all_requests, config):
     """Creates filters for looking within resource bundle files."""
-    return old_requests
+    json_data = config.filters_json_data
+    if "resourceFilters" not in json_data:
+        return all_requests
+
+    collected = {}
+    for entry in json_data["resourceFilters"]:
+        if "files" in entry:
+            file_filter = Filter.create_from_json(entry["files"])
+        else:
+            file_filter = InclusionFilter()
+        for category in entry["categories"]:
+            # not defaultdict because we need to pass arguments to the constructor
+            if category not in collected:
+                filter_info = ResourceFilterInfo(category)
+                filter_info.apply_to_requests(all_requests)
+                collected[category] = filter_info
+            else:
+                filter_info = collected[category]
+            filter_info.add_rules(file_filter, entry["rules"])
+
+    # Add the filter generation requests to the beginning so that by default
+    # they are made before genrb gets run (order is required by windirect)
+    new_requests = []
+    for filter_info in collected.values():
+        new_requests += filter_info.make_requests()
+    new_requests += all_requests
+    return new_requests
diff --git a/icu4c/source/data/buildtool/filtration_schema.json b/icu4c/source/data/buildtool/filtration_schema.json
new file mode 100644 (file)
index 0000000..619ae2a
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (C) 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+{
+    "$id": "http://unicode.org/icu-filter-schema",
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "description": "JSON Schema for an ICU data filter file",
+    "type": "object",
+    "properties": {
+        "localeFilter": { "$ref": "#/definitions/filter" },
+        "featureFilters": {
+            "type": "object",
+            "additionalProperties": { "$ref": "#/definitions/filter" }
+        },
+        "resourceFilters": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "categories": {
+                        "type": "array",
+                        "items": { "type": "string" }
+                    },
+                    "files": { "$ref": "#/definitions/filter" },
+                    "rules": {
+                        "type": "array",
+                        "items": {
+                            "type": "string",
+                            "pattern": "^[+-]/(\\w+(/\\w+)*)?$"
+                        }
+                    }
+                },
+                "required": ["categories", "rules"],
+                "additionalProperties": false
+            }
+        }
+    },
+    "additionalProperties": false,
+    "definitions": {
+        "filter": {
+            "type": "object",
+            "oneOf": [
+                {
+                    "properties": {
+                        "filterType": { "$ref": "#/definitions/filterType" },
+                        "whitelist": { "$ref": "#/definitions/stringList" }
+                    },
+                    "required": ["whitelist"],
+                    "additionalProperties": false
+                },
+                {
+                    "properties": {
+                        "filterType": { "$ref": "#/definitions/filterType" },
+                        "blacklist": { "$ref": "#/definitions/stringList" }
+                    },
+                    "required": ["blacklist"],
+                    "additionalProperties": false
+                },
+                {
+                    "properties": {
+                        "filterType": { "$ref": "#/definitions/filterType" }
+                    },
+                    "additionalProperties": false
+                }
+            ]
+        },
+        "filterType": {
+            "type": "string",
+            "enum": [
+                "file-stem",
+                "language",
+                "regex",
+                "exclude"
+            ]
+        },
+        "stringList": {
+            "type": "array",
+            "items": {
+                "type": "string"
+            },
+            "minItems": 1,
+            "uniqueItems": true
+        }
+    }
+}
index 722d432232ec8f8fcf3aced6cb8f315bafa2c257..1890dd3c4ac8b9b5204dfdeedd78d98361760ddc 100644 (file)
@@ -9,7 +9,7 @@ from abc import abstractmethod
 import copy
 import sys
 
-
+from . import *
 from . import utils
 
 
index 8889f51a791711c888a462af1b2c3b379ccafefc..d072428e90faf50ef3c7b180600197e511d7fadc 100644 (file)
@@ -86,6 +86,14 @@ def get_all_output_files(requests, include_tmp=False):
     return [f for _, f in set((type(f), f) for f in files)]
 
 
+def compute_directories(requests):
+    dirs = set()
+    for file in get_all_output_files(requests, include_tmp=True):
+        path = "%s/%s" % (dir_for(file), file.filename)
+        dirs.add(path[:path.rfind("/")])
+    return list(sorted(dirs))
+
+
 class SpaceSeparatedList(list):
     """A list that joins itself with spaces when converted to a string."""
     def __str__(self):
index e959dda4eb7676851f6ddeeb38b2dc998e3d3910..4eb3ca3ff9a9066e53a9d03f4254d4c6e422ed92 100644 (file)
@@ -8,8 +8,6 @@ from buildtool.request_types import *
 
 
 def generate(config, glob, common_vars):
-    build_dirs = ["{OUT_DIR}", "{TMP_DIR}"]
-
     requests = []
     requests += generate_rb(config, glob, common_vars)
     requests += generate_sprep(config, glob, common_vars)
@@ -26,7 +24,7 @@ def generate(config, glob, common_vars):
         )
     ]
 
-    return (build_dirs, requests)
+    return requests
 
 
 def generate_rb(config, glob, common_vars):