granicus.if.org Git - icu/commitdiff
ICU-21176 Add aliases for terms "whitelist" and "blacklist" in data filter
author Shane F. Carr <shane@unicode.org>
Tue, 11 Aug 2020 23:06:45 +0000 (18:06 -0500)
committer Shane F. Carr <shane@unicode.org>
Wed, 12 Aug 2020 02:21:59 +0000 (21:21 -0500)
See #1189

.ci-builds/data-filter.json
docs/userguide/icu_data/buildtool.md
icu4c/source/python/icutools/databuilder/filtration.py
icu4c/source/python/icutools/databuilder/filtration_schema.json

index ffde99567d0db28801ba2527306f7370cb4cddab..0cfc8fffc3637e69a2955b8a1e88577a39605226 100644 (file)
@@ -8,18 +8,29 @@
     ]
   },
 // Test mixed feature filter and resource filter
-// Exlude translit data so we can run test for ICU-20673
+// Exclude translit data so we can run test for ICU-20673
+// Also test for "whitelist" versus "includelist"
   "featureFilters": {
     "misc": {
       "whitelist": ["supplementalData"]
     },
-    "translit": "exclude"
+    "translit": "exclude",
+    "curr_tree": {
+      "filterType": "locale",
+      "includelist": ["my"]
+    },
+    "brkitr_rules": {
+      "excludelist": ["line"]
+    },
+    "brkitr_dictionaries": {
+      "blacklist": ["cjdict"]
+    }
   },
   "resourceFilters": [
     {
       "categories": ["misc"],
       "files": {
-        "whitelist": ["supplementalData"]
+        "includelist": ["supplementalData"]
       },
       "rules": ["+/*"]
     }
index 7c3eadcc262f6544b851debf60a85687bc01eef2..ff03e4f1382a95a69748f9b5ed14c1c9aa1dca2a 100644 (file)
@@ -76,7 +76,7 @@ languages:
     {
       "localeFilter": {
         "filterType": "language",
-        "whitelist": [
+        "includelist": [
           "en",
           "de",
           "zh"
@@ -86,6 +86,11 @@ languages:
 
 The *filterType* "language" only supports slicing by entire languages.
 
+##### Terminology: Includelist, Excludelist, Whitelist, Blacklist
+
+Prior to ICU 68, use `"whitelist"` and `"blacklist"` instead of `"includelist"`
+and `"excludelist"`, respectively. ICU 68 allows all four terms.
+
 #### Filtering by Locale
 
 For more control, use *filterType* "locale".  Here is a *filters.hjson* file that
@@ -94,13 +99,15 @@ only the default script (e.g., Simplified Han for Chinese):
 
     localeFilter: {
       filterType: locale
-      whitelist: [
+      includelist: [
         en
         de
         zh
       ]
     }
 
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
+
 #### Adding Script Variants (includeScripts = true)
 
 You may set the *includeScripts* option to true to include all scripts for a
@@ -112,7 +119,7 @@ Chinese are included:
       "localeFilter": {
         "filterType": "locale",
         "includeScripts": true,
-        "whitelist": [
+        "includelist": [
           "en",
           "de",
           "zh"
@@ -120,6 +127,8 @@ Chinese are included:
       }
     }
 
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
+
 If you wish to explicitly list the scripts, you may put the script code in the
 locale tag in the whitelist, and you do not need the *includeScripts* option
 enabled.  For example, in Hjson, to include Han Traditional ***but not Han
@@ -127,14 +136,16 @@ Simplified***:
 
     localeFilter: {
       filterType: locale
-      whitelist: [
+      includelist: [
         en
         de
         zh_Hant
       ]
     }
 
-Note: the option *includeScripts* is only supported at the language level;
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
+
+**Note:** the option *includeScripts* is only supported at the language level;
 i.e., in order to include all scripts for a particular language, you must
 specify the language alone, without a region tag.
 
@@ -150,7 +161,7 @@ German (Switzerland), or Chinese (Taiwan, Han Traditional):
     localeFilter: {
       filterType: locale
       includeChildren: false
-      whitelist: [
+      includelist: [
         en_US
         en_GB
         de_DE
@@ -158,6 +169,8 @@ German (Switzerland), or Chinese (Taiwan, Han Traditional):
       ]
     }
 
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
+
 Including dependencies, the above filter would include the following data files:
 
 - root.txt
@@ -285,7 +298,7 @@ dictionaries:
 
     featureFilters: {
       brkitr_dictionaries: {
-        whitelist: [
+        includelist: [
           burmesedict
         ]
       }
@@ -295,7 +308,8 @@ Do *not* include directories or file extensions.  They will be added
 automatically for you.  Note that all files in a particular category have the
 same directory and extension.
 
-You can use either a whitelist or a blacklist for the file name filter.
+You can use either `"includelist"` or `"excludelist"` for the file name filter.
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
 
 ##### Regex Filter
 
@@ -305,7 +319,7 @@ To exclude filenames matching a certain regular expression, use *filterType*
     featureFilters: {
       brkitr_rules: {
         filterType: regex
-        blacklist: [
+        excludelist: [
           ^.*_cj$
         ]
       }
@@ -353,12 +367,14 @@ the common locales specified in *localeFilter*, you can do the following:
     featureFilters:
       curr_tree: {
         filterType: locale
-        whitelist: [
+        includelist: [
           it
         ]
       }
     }
 
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
+
 You can exclude an entire `_tree` category without affecting other categories.
 For example, to exclude region display names:
 
@@ -446,7 +462,7 @@ following (this example removes calendar data):
       {
         categories: ["misc"]
         files: {
-          whitelist: ["supplementalData"]
+          includelist: ["supplementalData"]
         }
         rules: [
           -/calendarData
@@ -454,6 +470,8 @@ following (this example removes calendar data):
       }
     ]
 
+*If using ICU 67 or earlier, see note above regarding allowed keywords.*
+
 #### Combining Multiple Resource Filter Specs
 
 You can also list multiple resource filter objects in the *resourceFilters*
@@ -474,7 +492,7 @@ en-CA; this also makes use of the *files* option:
         categories: ["unit_tree"]
         files: {
           filterType: locale
-          whitelist: ["en_US"]
+          includelist: ["en_US"]
         }
         rules: [
           +/*/length/mile
@@ -484,7 +502,7 @@ en-CA; this also makes use of the *files* option:
         categories: ["unit_tree"]
         files: {
           filterType: locale
-          whitelist: ["en_CA"]
+          includelist: ["en_CA"]
         }
         rules: [
           +/*/length/kilometer
index 554013ac98d67b28c653c55dc3a1470003998487..5ad5f50e22b3896eee488d443a377b7ce4caabc9 100644 (file)
@@ -78,15 +78,22 @@ class ExclusionFilter(Filter):
         return False
 
 
-class WhitelistBlacklistFilter(Filter):
+class IncludeExcludeFilter(Filter):
     def __init__(self, json_data):
         if "whitelist" in json_data:
-            self.is_whitelist = True
-            self.whitelist = json_data["whitelist"]
+            self.is_includelist = True
+            self.includelist = json_data["whitelist"]
+        elif "includelist" in json_data:
+            self.is_includelist = True
+            self.includelist = json_data["includelist"]
+        elif "blacklist" in json_data:
+            self.is_includelist = False
+            self.excludelist = json_data["blacklist"]
+        elif "excludelist" in json_data:
+            self.is_includelist = False
+            self.excludelist = json_data["excludelist"]
         else:
-            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
-            self.is_whitelist = False
-            self.blacklist = json_data["blacklist"]
+            raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))
 
     def match(self, file):
         file_stem = self._file_to_file_stem(file)
@@ -97,43 +104,43 @@ class WhitelistBlacklistFilter(Filter):
         pass
 
 
-class FileStemFilter(WhitelistBlacklistFilter):
+class FileStemFilter(IncludeExcludeFilter):
     def _should_include(self, file_stem):
-        if self.is_whitelist:
-            return file_stem in self.whitelist
+        if self.is_includelist:
+            return file_stem in self.includelist
         else:
-            return file_stem not in self.blacklist
+            return file_stem not in self.excludelist
 
 
-class LanguageFilter(WhitelistBlacklistFilter):
+class LanguageFilter(IncludeExcludeFilter):
     def _should_include(self, file_stem):
         language = file_stem.split("_")[0]
         if language == "root":
             # Always include root.txt
             return True
-        if self.is_whitelist:
-            return language in self.whitelist
+        if self.is_includelist:
+            return language in self.includelist
         else:
-            return language not in self.blacklist
+            return language not in self.excludelist
 
 
-class RegexFilter(WhitelistBlacklistFilter):
+class RegexFilter(IncludeExcludeFilter):
     def __init__(self, *args):
         # TODO(ICU-20301): Change this to: super().__init__(*args)
         super(RegexFilter, self).__init__(*args)
-        if self.is_whitelist:
-            self.whitelist = [re.compile(pat) for pat in self.whitelist]
+        if self.is_includelist:
+            self.includelist = [re.compile(pat) for pat in self.includelist]
         else:
-            self.blacklist = [re.compile(pat) for pat in self.blacklist]
+            self.excludelist = [re.compile(pat) for pat in self.excludelist]
 
     def _should_include(self, file_stem):
-        if self.is_whitelist:
-            for pattern in self.whitelist:
+        if self.is_includelist:
+            for pattern in self.includelist:
                 if pattern.match(file_stem):
                     return True
             return False
         else:
-            for pattern in self.blacklist:
+            for pattern in self.excludelist:
                 if pattern.match(file_stem):
                     return False
             return True
@@ -159,7 +166,12 @@ LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
 
 class LocaleFilter(Filter):
     def __init__(self, json_data, io):
-        self.locales_requested = list(json_data["whitelist"])
+        if "whitelist" in json_data:
+            self.locales_requested = list(json_data["whitelist"])
+        elif "includelist" in json_data:
+            self.locales_requested = list(json_data["includelist"])
+        else:
+            raise AssertionError("You must have an includelist in a locale filter")
         self.include_children = json_data.get("includeChildren", True)
         self.include_scripts = json_data.get("includeScripts", False)
 
index 2b7ff9989992a12e77a81ad9d6f2a6418351f22e..3aed41a3341f654510aae34e8491c7e1d9d250ff 100644 (file)
@@ -90,7 +90,7 @@
                 {
                     "properties": {
                         "filterType": {
-                            "$ref": "#/definitions/blacklistWhitelistFilterTypes"
+                            "$ref": "#/definitions/includeExcludeFilterTypes"
                         },
                         "whitelist": { "$ref": "#/definitions/stringList" }
                     },
                 {
                     "properties": {
                         "filterType": {
-                            "$ref": "#/definitions/blacklistWhitelistFilterTypes"
+                            "$ref": "#/definitions/includeExcludeFilterTypes"
                         },
                         "blacklist": { "$ref": "#/definitions/stringList" }
                     },
                     "required": ["blacklist"],
                     "additionalProperties": false
                 },
+                {
+                    "properties": {
+                        "filterType": {
+                            "$ref": "#/definitions/includeExcludeFilterTypes"
+                        },
+                        "includelist": { "$ref": "#/definitions/stringList" }
+                    },
+                    "required": ["includelist"],
+                    "additionalProperties": false
+                },
+                {
+                    "properties": {
+                        "filterType": {
+                            "$ref": "#/definitions/includeExcludeFilterTypes"
+                        },
+                        "excludelist": { "$ref": "#/definitions/stringList" }
+                    },
+                    "required": ["excludelist"],
+                    "additionalProperties": false
+                },
                 {
                     "properties": {
                         "filterType": {
                     "required": ["filterType", "whitelist"],
                     "additionalProperties": false
                 },
+                {
+                    "properties": {
+                        "filterType": {
+                            "type": "string",
+                            "enum": ["locale"]
+                        },
+                        "includeChildren": {
+                            "type": "boolean"
+                        },
+                        "includeScripts": {
+                            "type": "boolean"
+                        },
+                        "includelist": { "$ref": "#/definitions/stringList" }
+                    },
+                    "required": ["filterType", "includelist"],
+                    "additionalProperties": false
+                },
                 {
                     "properties": {
                         "filterType": {
                 }
             ]
         },
-        "blacklistWhitelistFilterTypes": {
+        "includeExcludeFilterTypes": {
             "type": "string",
             "enum": [
                 "language",