From 5fc2d494f853bb94f6f3012c552766f8ea1a6a74 Mon Sep 17 00:00:00 2001
From: Peter Edberg <pedberg@unicode.org>
Date: Tue, 9 Sep 2014 03:20:53 +0000
Subject: [PATCH] ICU-10326 Add dictionary-based word/line break for
 Burmese/Myanmar (J), logKnownIssue #11245

X-SVN-Rev: 36401
---
 .gitattributes                                |   1 +
 .../com/ibm/icu/text/BurmeseBreakEngine.java  | 225 ++++++++++++++++++
 .../ibm/icu/text/RuleBasedBreakIterator.java  |   3 +
 icu4j/main/shared/data/icudata.jar            |   4 +-
 icu4j/main/shared/data/testdata.jar           |   2 +-
 .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java |  13 +-
 .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt |  13 +
 7 files changed, 257 insertions(+), 4 deletions(-)
 create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java

diff --git a/.gitattributes b/.gitattributes
index 7964079079e..9cf81a418ed 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -427,6 +427,7 @@ icu4j/main/classes/core/.settings/org.eclipse.jdt.core.prefs -text
 icu4j/main/classes/core/manifest.stub -text
 icu4j/main/classes/core/src/com/ibm/icu/impl/TZDBTimeZoneNames.java -text
 icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java -text
+icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java -text
 icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java -text
 icu4j/main/classes/core/src/com/ibm/icu/text/SimpleFilteredBreakIteratorBuilder.java -text
 icu4j/main/classes/currdata/.externalToolBuilders/copy-data-currdata.launch -text
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
new file mode 100644
index 00000000000..f70e8bf8d2b
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
@@ -0,0 +1,225 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2014, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.io.IOException;
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+
+class BurmeseBreakEngine extends DictionaryBreakEngine {
+    
+    // Constants for BurmeseBreakEngine
+    // How many words in a row are "good enough"?
+    private static final byte BURMESE_LOOKAHEAD = 3;
+    // Will not combine a non-word with a preceding dictionary word longer than this
+    private static final byte BURMESE_ROOT_COMBINE_THRESHOLD = 3;
+    // Will not combine a non-word with a preceding word if the non-word shares at
+    // least this much prefix with a dictionary word
+    private static final byte BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
+    // Minimum word size
+    private static final byte BURMESE_MIN_WORD = 2;
+    
+    private DictionaryMatcher fDictionary;
+    private static UnicodeSet fBurmeseWordSet;
+    private static UnicodeSet fEndWordSet;
+    private static UnicodeSet fBeginWordSet;
+    private static UnicodeSet fMarkSet;
+    
+    static {
+        // Initialize UnicodeSets
+        fBurmeseWordSet = new UnicodeSet();
+        fMarkSet = new UnicodeSet();
+        fBeginWordSet = new UnicodeSet();
+
+        fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
+        fBurmeseWordSet.compact();
+
+        fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
+        fMarkSet.add(0x0020);
+        fEndWordSet = new UnicodeSet(fBurmeseWordSet);
+        fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
+
+        // Compact for caching
+        fMarkSet.compact();
+        fEndWordSet.compact();
+        fBeginWordSet.compact();
+        
+        // Freeze the static UnicodeSets
+        fBurmeseWordSet.freeze();
+        fMarkSet.freeze();
+        fEndWordSet.freeze();
+        fBeginWordSet.freeze();
+    }
+    
+    public BurmeseBreakEngine() throws IOException {
+        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
+        setCharacters(fBurmeseWordSet);
+        // Initialize dictionary
+        fDictionary = DictionaryData.loadDictionaryFor("Mymr");
+    }
+
+    public boolean equals(Object obj) {
+        // Normally this is a singleton, but it's possible to have duplicates
+        //   during initialization. All are equivalent.
+        return obj instanceof BurmeseBreakEngine;
+    }
+
+    public int hashCode() {
+        return getClass().hashCode();
+    }
+    
+    public boolean handles(int c, int breakType) {
+        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
+            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+            return (script == UScript.MYANMAR);
+        }
+        return false;
+    }
+
+    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
+            DequeI foundBreaks) {
+        
+        
+        if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
+            return 0;  // Not enough characters for word
+        }
+        int wordsFound = 0;
+        int wordLength;
+        int current;
+        PossibleWord words[] = new PossibleWord[BURMESE_LOOKAHEAD];
+        for (int i = 0; i < BURMESE_LOOKAHEAD; i++) {
+            words[i] = new PossibleWord();
+        }
+        int uc;
+
+        fIter.setIndex(rangeStart);
+        while ((current = fIter.getIndex()) < rangeEnd) {
+            wordLength = 0;
+
+            // Look for candidate words at the current position
+            int candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
+
+            // If we found exactly one, use that
+            if (candidates == 1) {
+                wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
+                wordsFound += 1;
+            }
+
+            // If there was more than one, see which one can take us forward the most words
+            else if (candidates > 1) {
+                boolean foundBest = false;
+                // If we're already at the end of the range, we're done
+                if (fIter.getIndex() < rangeEnd) {
+                    do {
+                        int wordsMatched = 1;
+                        if (words[(wordsFound+1)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
+                            if (wordsMatched < 2) {
+                                // Followed by another dictionary word; mark first word as a good candidate
+                                words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
+                                wordsMatched = 2;
+                            }
+
+                            // If we're already at the end of the range, we're done
+                            if (fIter.getIndex() >= rangeEnd) {
+                                break;
+                            }
+
+                            // See if any of the possible second words is followed by a third word
+                            do {
+                                // If we find a third word, stop right away
+                                if (words[(wordsFound+2)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
+                                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
+                                    foundBest = true;
+                                    break;
+                                }
+                            } while (words[(wordsFound+1)%BURMESE_LOOKAHEAD].backUp(fIter));
+                        }
+                    } while (words[wordsFound%BURMESE_LOOKAHEAD].backUp(fIter) && !foundBest);
+                }
+                wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
+                wordsFound += 1;
+            }
+
+            // We come here after having either found a word or not. We look ahead to the
+            // next word. If it's not a dictionary word, we will combine it with the word we
+            // just found (if there is one), but only if the preceding word does not exceed
+            // the threshold.
+            // The text iterator should now be positioned at the end of the word we found.
+            if (fIter.getIndex() < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
+                // If it is a dictionary word, do nothing. If it isn't, then if there is
+                // no preceding word, or the non-word shares less than the minimum threshold
+                // of characters with a dictionary word, then scan to resynchronize
+                if (words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
+                        (wordLength == 0 || 
+                                words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
+                    // Look for a plausible word boundary
+                    int remaining = rangeEnd - (current + wordLength);
+                    int pc = fIter.current();
+                    int chars = 0;
+                    for (;;) {
+                        fIter.next();
+                        uc = fIter.current();
+                        chars += 1;
+                        if (--remaining <= 0) {
+                            break;
+                        }
+                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
+                            // Maybe. See if it's in the dictionary.
+                            int candidate = words[(wordsFound + 1) %BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
+                            fIter.setIndex(current + wordLength + chars);
+                            if (candidate > 0) {
+                                break;
+                            }
+                        }
+                        pc = uc;
+                    }
+
+                    // Bump the word count if there wasn't already one
+                    if (wordLength <= 0) {
+                        wordsFound += 1;
+                    }
+
+                    // Update the length with the passed-over characters
+                    wordLength += chars;
+                } else {
+                    // Back up to where we were for the next iteration
+                    fIter.setIndex(current+wordLength);
+                }
+            }
+
+            // Never stop before a combining mark.
+            int currPos;
+            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
+                fIter.next();
+                wordLength += fIter.getIndex() - currPos;
+            }
+
+            // Look ahead for possible suffixes if a dictionary word does not follow.
+            // We do this in code rather than using a rule so that the heuristic
+            // resynch continues to function. For example, one of the suffix characters 
+            // could be a typo in the middle of a word.
+            // NOT CURRENTLY APPLICABLE TO BURMESE
+
+            // Did we find a word on this iteration? If so, push it on the break stack
+            if (wordLength > 0) {
+                foundBreaks.push(Integer.valueOf(current + wordLength));
+            }
+        }
+
+        // Don't return a break for the end of the dictionary range if there is one there
+        if (foundBreaks.peek() >= rangeEnd) {
+            foundBreaks.pop();
+            wordsFound -= 1;
+        }
+
+        return wordsFound;
+    }
+
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
index 0dd194f386d..65de4d4e954 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -1113,6 +1113,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
                 case UScript.LAO:
                     eng = new LaoBreakEngine();
                     break;
+                case UScript.MYANMAR:
+                    eng = new BurmeseBreakEngine();
+                    break;
                 case UScript.KHMER:
                     eng = new KhmerBreakEngine();
                     break;
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar
index 1f1c61ca78f..d9c4dd13d3f 100755
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:391473d52c38a476dd5629eb8f409a867d7e31b7bdfce333a288d4a01dfed280
-size 11619130
+oid sha256:a27318a8ad0493a3960fdef1c6f169594722a03d812977640c7af973c78d6a99
+size 11792149
diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar
index ce0f1c4769f..af882da93e7 100755
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:71a3084b6e262dd9b01baacc71f91bf8341a63381baf79988a2a2cf1c9088f4a
+oid sha256:b13f033278b26b969e433a5da6752fd2f9bd496a3c04813ee75998b1174c3971
 size 812411
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
index 25e2d067890..6c3becd8e66 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -1,6 +1,6 @@
 /*
  *******************************************************************************
- * Copyright (C) 2003-2013 International Business Machines Corporation and
+ * Copyright (C) 2003-2014 International Business Machines Corporation and
  * others. All Rights Reserved.
  *******************************************************************************
  */
@@ -1848,6 +1848,17 @@ void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int
             }
 
 
+            // Exclude Myanmar from tests; it is dictionary-based. Not sure how this is handled
+            // for other scripts with dictionary-based breaking, but it is not working for Myanmar.
+            if (errorType != null && errorType.equals("next()") && name.equals("line")) {
+                int cBefore = UTF16.charAt(testText, i-1);
+                int cAfter = UTF16.charAt(testText, i);
+                if (cBefore >= 0x1000 && cBefore <= 0x109F && cAfter >= 0x1000 && cAfter <= 0x109F &&
+                        logKnownIssue("11245", "Skip errors for unexpected line breaks between Myanmar characters")) {
+                    errorType = null;
+                }
+            }
+
             if (errorType != null) {
                 // Format a range of the test text that includes the failure as
                 //  a data item that can be included in the rbbi test data file.
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
index 7e10d0c18ed..9bcac3a7217 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -718,6 +718,19 @@ Bangkok)â¢</data>
 <data>â¢à»àºàº»à»àº²â¢à»àº§àº»à»àº²â¢àºàº²àºªàº²â¢àº­àº±àºàºàº´àºâ¢à»àºà»â¢àºà»à»â¢</data>
 <data>â¢àºàº°àº¥àº¸àºàº²â¢à»àº§àº»à»àº²â¢àºà»àº²â¢à»â¢</data>
 
+##########################################################################################
+#
+#   Burmese/Myanmar Tests
+#
+##########################################################################################
+<locale en>
+# Basic sanity check for #10326 (some text from http://www.unicode.org/udhr/d/udhr_mya.txt)
+<line>
+<data>â¢áá°â¢áá­á¯ááºá¸â¢áááº â¢áá°áá® â¢áá½ááºáááºâ¢áá±á¬ â¢áá¯ááºâ¢áá­â¢áá¹â¢áá«â¢áá¼â¢ááºá· â¢áááºá¸áá±á¬ááºá¸á â¢</data>
+<data>â¢áá°áá®â¢áá½ááºáááºâ¢áá±á¬ â¢á¡â¢áá½â¢ááºá·â¢á¡áá±á¸â¢áá»á¬á¸â¢áá¼â¢ááºá· â¢áááºá¸áá±á¬ááºá¸á â¢áá½á±á¸â¢áá½á¬á¸â¢áá¬â¢áá°áá»á¬á¸ â¢áá¼ááºáááºáâ¢</data>
+<data>â¢áá­á¯â¢áá°â¢áá­á¯á·á â¢áá­á¯ááºá¸áá¼á¬á¸ â¢áá±áááºâ¢áááºâ¢áá±á¬ â¢áá¬ááºâ¢áá¾â¢ááºá· â¢áá»â¢ááºá·â¢áááº â¢áá­áááºâ¢áá±á¬ â¢áá­ááºâ¢áá­á¯á·â¢áá¾á­â¢áá¼á â¢</data>
+<data>â¢áá­á¯â¢áá°â¢áá­á¯á·áááº â¢á¡áá»ááºá¸áá»ááºá¸ â¢áá±áá¹áá¬â¢áá¬á¸á â¢áááºáá¶â¢áá»â¢ááºá·â¢áá¯á¶á¸â¢</data>
+
 ##########################################################################################
 #
 #   Khmer Tests
-- 
2.40.0