]> granicus.if.org Git - icu/commitdiff
ICU-20693 Adding support for deletion of existing files prior to ICU data generation.
authorDavid Beaumont <dbeaumont@google.com>
Thu, 12 Sep 2019 11:58:39 +0000 (13:58 +0200)
committerDavid Beaumont <david.beaumont+github@gmail.com>
Wed, 6 Nov 2019 22:47:52 +0000 (23:47 +0100)
tools/cldr/cldr-to-icu/build-icu-data.xml
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java [new file with mode: 0644]
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java

index 82b00f331ac175492f334c02c0b32244ad08ba0a..aa4bd4b1c8208cc8ef82db784dd0da2495f07f8c 100644 (file)
 <!-- TODO: Add things like copying of a template directory and deleting previous files
      (perhaps always generate into a temporary directory and copy back to avoid having
       inconsistent state when the conversion is cancelled). -->
-<project name="Convert" default="convert" basedir=".">
+<project name="Convert" default="all" basedir=".">
+
+    <target name="all" depends="init-args, prepare-jar, clean, convert"/>
+
     <!-- Initialize the properties which were not already set on the command line. -->
     <target name="init-args">
         <property environment="env"/>
              since some supplemental data is also written to the curr/ directory.
 
              See LdmlConverter.OutputType for the full list of valid types. -->
-        <!-- TODO: Find out what people actually want here and switch to that. -->
+        <!-- TODO: Find out what common use cases are and use them. -->
         <property name="outputTypes" value=""/>
+
+        <!-- Override to force the 'clean' task to delete files it cannot determine to be
+             auto-generated by this tool. This is useful if the file header changes since
+             the heading is what's used to recognize auto-generated files. -->
+        <property name="forceDelete" value="false"/>
     </target>
 
     <!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
                           locales="xx,yy_ZZ"/> -->
         </convert>
     </target>
+
+    <target name="clean" depends="init-args, prepare-jar">
+        <taskdef name="outputDirectories" classname="org.unicode.icu.tool.cldrtoicu.ant.CleanOutputDirectoryTask">
+            <classpath>
+                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
+            </classpath>
+        </taskdef>
+
+        <outputDirectories root="${outDir}" forceDelete="${forceDelete}">
+            <dir name="brkitr">
+                <retain path="dictionaries"/>
+                <retain path="rules"/>
+            </dir>
+            <dir name="coll">
+                <retain path="de__PHONEBOOK.txt"/>
+                <retain path="de_.txt"/>
+                <retain path="es__TRADITIONAL.txt"/>
+                <retain path="es_.txt"/>
+            </dir>
+            <dir name="curr"/>
+            <dir name="lang"/>
+            <dir name="locales"/>
+            <dir name="misc">
+                <retain path="currencyNumericCodes.txt"/>
+                <retain path="icudata.rc"/>
+                <retain path="icustd.txt"/>
+                <retain path="icuver.txt"/>
+                <retain path="langInfo.txt"/>
+                <retain path="zoneinfo64.txt"/>
+                <!-- This file should be removed before the next release. -->
+                <retain path="miscfiles.mk"/>
+            </dir>
+            <dir name="rbnf"/>
+            <dir name="region"/>
+            <dir name="translit">
+                <retain path="en.txt"/>
+                <retain path="el.txt"/>
+                <!-- This file should be removed before the next release. -->
+                <retain path="trnsfiles.mk"/>
+            </dir>
+            <dir name="unit"/>
+            <dir name="zone">
+                <retain path="tzdbNames.txt"/>
+            </dir>
+        </outputDirectories>
+    </target>
 </project>
+
index 18763299282c7e563a6f26dd01331fa3f7e3691f..a91ad264ec01cb15a2a457507b7940616a6214b2 100644 (file)
@@ -3,12 +3,16 @@
 package org.unicode.icu.tool.cldrtoicu;
 
 import static com.google.common.base.Preconditions.checkNotNull;
+import static java.nio.file.StandardOpenOption.CREATE;
+import static java.nio.file.StandardOpenOption.CREATE_NEW;
+import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
 import static java.util.stream.Collectors.joining;
 
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.Writer;
 import java.nio.file.Files;
+import java.nio.file.OpenOption;
 import java.nio.file.Path;
 import java.util.List;
 import java.util.regex.Matcher;
@@ -34,11 +38,18 @@ final class IcuTextWriter {
     private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
     private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");
 
+    private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW };
+    private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING };
+
     /** Write a file in ICU data format with the specified header. */
-    static void writeToFile(IcuData icuData, Path outDir, List<String> header) {
+    static void writeToFile(
+        IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite) {
+
         try {
             Files.createDirectories(outDir);
-            try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt"));
+            Path file = outDir.resolve(icuData.getName() + ".txt");
+            OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES;
+            try (Writer w = Files.newBufferedWriter(file, fileOptions);
                 PrintWriter out = new PrintWriter(w)) {
                 new IcuTextWriter(icuData).writeTo(out, header);
             }
index 6d2d600465b2d7bf74076202942603b81c9ca24f..5aef1d9f036a7eaff3ea249ff6636af2c893d41a 100644 (file)
@@ -33,7 +33,6 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
-import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
@@ -225,7 +224,7 @@ public final class LdmlConverter {
         this.localeTransformer =
             RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"),
                 IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
-        this.fileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
+        this.fileHeader = readLinesFromResource("/ldml2icu_header.txt");
     }
 
     private void convertAll() {
@@ -237,9 +236,9 @@ public final class LdmlConverter {
         }
     }
 
-    private static List<String> readLinesFromResource(String name) {
+    private static ImmutableList<String> readLinesFromResource(String name) {
         try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) {
-            return CharStreams.readLines(new InputStreamReader(in));
+            return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in)));
         } catch (IOException e) {
             throw new RuntimeException("cannot read resource: " + name, e);
         }
@@ -336,7 +335,7 @@ public final class LdmlConverter {
 
                 if (!splitData.getPaths().isEmpty() || isBaseLanguage || dir.includeEmpty()) {
                     splitData.setVersion(CldrDataSupplier.getCldrVersionString());
-                    write(splitData, outDir);
+                    write(splitData, outDir, false);
                     writtenLocaleIds.put(dir, id);
                 }
             }
@@ -472,7 +471,7 @@ public final class LdmlConverter {
 
             case TRANSFORMS:
                 Path transformDir = createDirectory(config.getOutputDir().resolve("translit"));
-                write(TransformsMapper.process(src, transformDir, fileHeader), transformDir);
+                write(TransformsMapper.process(src, transformDir, fileHeader), transformDir, false);
                 break;
 
             case KEY_TYPE_DATA:
@@ -502,7 +501,9 @@ public final class LdmlConverter {
     private void writeAliasFile(String srcId, String destId, Path dir) {
         IcuData icuData = new IcuData(srcId, true);
         icuData.add(RB_ALIAS, destId);
-        write(icuData, dir);
+        // Allow overwrite for aliases since some are "forced" and overwrite existing targets.
+        // TODO: Maybe tighten this up so only forced aliases for existing targets are overwritten.
+        write(icuData, dir, true);
     }
 
     private void writeEmptyFile(String id, Path dir, Collection<String> aliasTargets) {
@@ -516,16 +517,16 @@ public final class LdmlConverter {
             // which is itself not in the set of written ICU files. An "indirect alias target".
             icuData.setVersion(CldrDataSupplier.getCldrVersionString());
         }
-        write(icuData, dir);
+        write(icuData, dir, false);
     }
 
     private void write(IcuData icuData, String dir) {
-        write(icuData, config.getOutputDir().resolve(dir));
+        write(icuData, config.getOutputDir().resolve(dir), false);
     }
 
-    private void write(IcuData icuData, Path dir) {
+    private void write(IcuData icuData, Path dir, boolean allowOverwrite) {
         createDirectory(dir);
-        IcuTextWriter.writeToFile(icuData, dir, fileHeader);
+        IcuTextWriter.writeToFile(icuData, dir, fileHeader, allowOverwrite);
     }
 
     private Path createDirectory(Path dir) {
index de7252c518237b91c65909042c8c44e60310fad6..d319527c593560cf9bd9ed7f974bce2ae7b92fef 100644 (file)
@@ -42,7 +42,7 @@ public interface LdmlConverterConfig {
         }
 
         /** Returns the relative output directory name. */
-        String getOutputDir() {
+        public String getOutputDir() {
             return dirName;
         }
 
@@ -51,7 +51,7 @@ public interface LdmlConverterConfig {
          * the supported set of locales for the "service" provided by the data in that
          * directory).
          */
-        // TODO: Document why there's a difference between directories for empty directories.
+        // TODO: Document why there's a difference between directories for empty files.
         boolean includeEmpty() {
             return includeEmpty;
         }
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java
new file mode 100644 (file)
index 0000000..ddd34ea
--- /dev/null
@@ -0,0 +1,281 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.ant;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableSet.toImmutableSet;
+import static java.nio.file.LinkOption.NOFOLLOW_LINKS;
+import static java.util.stream.Collectors.joining;
+import static java.util.stream.Collectors.partitioningBy;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.tools.ant.BuildException;
+import org.apache.tools.ant.Task;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.io.CharStreams;
+
+// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed.
+public final class CleanOutputDirectoryTask extends Task {
+    private static final ImmutableSet<String> ALLOWED_DIRECTORIES =
+        Stream
+            .concat(
+                Stream.of("misc", "translit"),
+                Arrays.stream(IcuLocaleDir.values()).map(IcuLocaleDir::getOutputDir))
+            .sorted()
+            .collect(toImmutableSet());
+
+    private static final CharMatcher NOT_WHITESPACE = CharMatcher.whitespace().negate();
+
+    private Path root = null;
+    private boolean forceDelete = false;
+    private final List<Dir> outputDirs = new ArrayList<>();
+    private final ImmutableList<String> headerLines;
+
+    public CleanOutputDirectoryTask() {
+        // TODO: Consider passing in header lines via Ant?
+        this.headerLines = readLinesFromResource("/ldml2icu_header.txt");
+    }
+
+    public static final class Retain extends Task {
+        private Path path = null;
+
+        // Don't use "Path" for the argument type because that always makes an absolute path (e.g.
+        // relative to the working directory for the Ant task). We want relative paths.
+        @SuppressWarnings("unused")
+        public void setPath(String path) {
+            Path p = Paths.get(path).normalize();
+            checkBuild(!p.isAbsolute() && !p.startsWith(".."), "invalid path: %s", path);
+            this.path = p;
+        }
+
+        @Override
+        public void init() throws BuildException {
+            checkBuild(path != null, "missing 'path' attribute");
+        }
+    }
+
+    public static final class Dir extends Task {
+        private String name;
+        private final Set<Path> retained = new HashSet<>();
+
+        @SuppressWarnings("unused")
+        public void setName(String name) {
+            checkBuild(ALLOWED_DIRECTORIES.contains(name),
+                "unknown directory name '%s'; allowed values: %s", name, ALLOWED_DIRECTORIES);
+            this.name = name;
+        }
+
+        @SuppressWarnings("unused")
+        public void addConfiguredRetain(Retain retain) {
+            retained.add(retain.path);
+        }
+
+        @Override
+        public void init() throws BuildException {
+            checkBuild(name != null, "missing 'name' attribute");
+        }
+    }
+
+    @SuppressWarnings("unused")
+    public void setRoot(Path root) {
+        this.root = root;
+    }
+
+    @SuppressWarnings("unused")
+    public void setForceDelete(boolean forceDelete) {
+        this.forceDelete = forceDelete;
+    }
+
+    @SuppressWarnings("unused")
+    public void addConfiguredDir(Dir dir) {
+        outputDirs.add(dir);
+    }
+
+    @Override
+    public void execute() throws BuildException {
+        checkBuild(root != null, "missing 'root' attribute");
+        checkBuild(!outputDirs.isEmpty(), "missing <dir> elements");
+
+        if (!Files.exists(root)) {
+            log("Root directory '" + root + "' does not exist (nothing to clean)");
+            return;
+        }
+        checkBuild(Files.isDirectory(root), "specified root '%s' is not a directory", root);
+
+        Set<Path> autogenFiles = new TreeSet<>();
+        Set<Path> unknownFiles = new TreeSet<>();
+        for (Dir dirInfo : outputDirs) {
+            Path dirPath = root.resolve(dirInfo.name);
+            if (!Files.exists(dirPath)) {
+                continue;
+            }
+            checkBuild(Files.isDirectory(dirPath), "'%s' is not a directory", dirPath);
+
+            // Note: For now we just walk the immediate contents of each output directory and don't
+            // attempt to recursively process things. Only a couple of output directories have
+            // sub-directories anyway, and we never write files into them anyway.
+            try (Stream<Path> files = Files.list(dirPath)) {
+                Map<Boolean, List<Path>> map = files
+                    .filter(p -> couldDelete(p, dirPath, dirInfo))
+                    .parallel()
+                    .collect(partitioningBy(this::wasAutoGenerated));
+                unknownFiles.addAll(map.get(false));
+                autogenFiles.addAll(map.get(true));
+            } catch (IOException e) {
+                throw new BuildException("Error processing directory: " + dirPath, e);
+            }
+        }
+
+        if (!unknownFiles.isEmpty() && !forceDelete) {
+            // If there are NO safe files, then something weird is going on (perhaps a change in
+            // the header file).
+            if (autogenFiles.isEmpty()) {
+                log("Error determining 'safe' files for deletion (no auto-generated files found).");
+                log(unknownFiles.size() + " files would be deleted for 'clean' task");
+                logPartioned(unknownFiles);
+                log("Set '-DforceDelete=true' to delete all files not listed in"
+                    + " <outputDirectories>.");
+            } else {
+                // A mix of safe and unsafe files is weird, but in this case it should be a
+                // relatively small number of files (e.g. adding a new manually maintained file or
+                // accidental editing of header lines).
+                log("Unknown files exist which cannot be determined to be auto-generated");
+                log("Files:");
+                logPartioned(unknownFiles);
+                log(String.format("%d unknown files or directories found", unknownFiles.size()));
+                log("Set '-DforceDelete=true' to delete these files, or add them to"
+                    + " <outputDirectories>.");
+            }
+            throw new BuildException("Unsafe files cannot be deleted");
+        }
+        if (!unknownFiles.isEmpty()) {
+            checkState(forceDelete, "unexpected flag state (forceDelete should be true here)");
+            List<Path> filesToDelete =
+                unknownFiles.stream()
+                    .filter(p -> !Files.isDirectory(p))
+                    .collect(Collectors.toList());
+            log(String.format("Force deleting %,d files...\n", filesToDelete.size()));
+            deleteAllFiles(filesToDelete);
+
+            List<Path> unknownDirs =
+                unknownFiles.stream()
+                    .filter(p -> Files.isDirectory(p))
+                    .collect(Collectors.toList());
+            if (!unknownDirs.isEmpty()) {
+                log("Add the following directories to the <outputDirectories> task:");
+                logPartioned(unknownDirs);
+                throw new BuildException("Unsafe directories cannot be deleted");
+            }
+        }
+        if (!autogenFiles.isEmpty()) {
+            log(String.format("Deleting %,d auto-generated files...\n", autogenFiles.size()));
+            deleteAllFiles(autogenFiles);
+        }
+    }
+
+    private void logPartioned(Iterable<Path> files) {
+        Iterables.partition(files, 5)
+            .forEach(f -> log(
+                f.stream().map(p -> root.relativize(p).toString()).collect(joining(", "))));
+    }
+
+    private boolean couldDelete(Path path, Path dir, Dir dirInfo) {
+        return !dirInfo.retained.contains(dir.relativize(path));
+    }
+
+    private boolean wasAutoGenerated(Path path) {
+        if (!Files.isRegularFile(path, NOFOLLOW_LINKS)) {
+            // Directories, symbolic links, devices etc.
+            return false;
+        }
+        try (BufferedReader r = Files.newBufferedReader(path)) {
+            // A byte-order-mark (BOM) is added to ICU data files, but not JSON deps files, so just
+            // treat it as optional everywhere (it's not the important thing we check here).
+            r.mark(1);
+            int maybeByteOrderMark = r.read();
+            if (maybeByteOrderMark != '\uFEFF') {
+                // Also reset if the file was empty, but that should be harmless.
+                r.reset();
+            }
+            for (String headerLine : headerLines) {
+                String line = r.readLine();
+                if (line == null) {
+                    return false;
+                }
+                int headerStart = skipComment(line);
+                if (headerStart < 0
+                    || !line.regionMatches(headerStart, headerLine, 0, headerLine.length())) {
+                    return false;
+                }
+            }
+            return true;
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    private static int skipComment(String line) {
+        if (line.startsWith("#")) {
+            return toCommentStart(line, 1);
+        } else if (line.startsWith("//")) {
+            return toCommentStart(line, 2);
+        }
+        return -1;
+    }
+
+    // Not just "index-of" since a comment start followed by only whitespace is NOT a failure to
+    // find a comment (since the header might have an empty line in it, which should be okay).
+    private static int toCommentStart(String line, int offset) {
+        int index = NOT_WHITESPACE.indexIn(line, offset);
+        return index >= 0 ? index : line.length();
+    }
+
+    private static void deleteAllFiles(Iterable<Path> files) {
+        for (Path p : files) {
+            try {
+                // This is a code error, since only files should be passed here.
+                checkArgument(!Files.isDirectory(p), "Cannot delete directories: %s", p);
+                Files.deleteIfExists(p);
+            } catch (IOException e) {
+                throw new BuildException("Error deleting file: " + p, e);
+            }
+        }
+    }
+
+    private static void checkBuild(boolean condition, String message, Object... args) {
+        if (!condition) {
+            throw new BuildException(String.format(message, args));
+        }
+    }
+
+    private static ImmutableList<String> readLinesFromResource(String name) {
+        try (InputStream in = CleanOutputDirectoryTask.class.getResourceAsStream(name)) {
+            return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in)));
+        } catch (IOException e) {
+            throw new RuntimeException("cannot read resource: " + name, e);
+        }
+    }
+}
index e0844e653ddfb0d74457777bd797ab68654109e0..ece1aaa62e2ea55ddfd0f23c75730fc9a1552fc6 100644 (file)
@@ -4,8 +4,7 @@ package org.unicode.icu.tool.cldrtoicu.mapper;
 
 import static com.google.common.base.CharMatcher.whitespace;
 import static com.google.common.base.Preconditions.checkNotNull;
-import static java.nio.file.StandardOpenOption.CREATE;
-import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
+import static java.nio.file.StandardOpenOption.CREATE_NEW;
 import static org.unicode.cldr.api.AttributeKey.keyOf;
 import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
 import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
@@ -86,7 +85,8 @@ public final class TransformsMapper {
         Function<Path, PrintWriter> fileWriterFn = p -> {
             Path file = ruleFileOutputDir.resolve(p);
             try {
-                return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING));
+                // Specify "CREATE_NEW" since we don't want to overwrite any existing files.
+                return new PrintWriter(Files.newBufferedWriter(file, CREATE_NEW));
             } catch (IOException e) {
                 throw new RuntimeException("error opening file: " + file, e);
             }