ICU-6889 Add test for CharsetDetector.detectAll() producing the same encoding multiple times.

author Andy Heninger <andy.heninger@gmail.com>

Tue, 5 Jun 2012 17:40:59 +0000 (17:40 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Tue, 5 Jun 2012 17:40:59 +0000 (17:40 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 5 Jun 2012 17:40:59 +0000 (17:40 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Tue, 5 Jun 2012 17:40:59 +0000 (17:40 +0000)
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java

index fb3a65bc5a51f6bbd7810a72a7c00002f4ffc05e..71c5fb2adae971e923516764c0c7da3a6a0e2ff8 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -11,6 +11,7 @@ import java.io.ByteArrayOutputStream;
  import java.io.InputStream;
  import java.io.Reader;
  import java.io.UnsupportedEncodingException;
+import java.util.HashSet;
  
  import javax.xml.parsers.DocumentBuilder;
  import javax.xml.parsers.DocumentBuilderFactory;
@@ -1104,7 +1105,30 @@ public class TestCharsetDetector extends TestFmwk
  
          name1 = match1.getName();
          assertEquals("Wrong charset name after running a second charset detector", "windows-1252", name1);
-
+    }
+    
+    public void TestBug6889() {
+        // ICU-6889: verify that CharsetDetector.detectAll() does not return the same encoding multiple times.
+        String text =
+            "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
+        byte[] textBytes;
+        try {
+            textBytes = text.getBytes("ISO-8859-1");
+        }
+        catch (Exception e) {
+            fail("Unexpected exception " + e.toString());
+            return;  // needed so textBytes is definitely assigned on every path that continues
+        }
+        
+        CharsetDetector det = new CharsetDetector();
+        det.setText(textBytes);
+        CharsetMatch matches[] = det.detectAll();
+        
+        HashSet<String> detectedEncodings = new HashSet<String>();  // charset names seen so far
+        for (CharsetMatch m: matches) {
+            assertTrue("Charset " + m.getName() + " encountered before",
+                        detectedEncodings.add(m.getName()));  // add() returns false for a name already in the set
+        }   
      }
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 5 Jun 2012 17:40:59 +0000 (17:40 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Tue, 5 Jun 2012 17:40:59 +0000 (17:40 +0000)