#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.

author Florent Xicluna <florent.xicluna@gmail.com>

Tue, 30 Mar 2010 08:24:06 +0000 (08:24 +0000)

committer Florent Xicluna <florent.xicluna@gmail.com>

Tue, 30 Mar 2010 08:24:06 +0000 (08:24 +0000)
author Florent Xicluna <florent.xicluna@gmail.com>
Tue, 30 Mar 2010 08:24:06 +0000 (08:24 +0000)
committer Florent Xicluna <florent.xicluna@gmail.com>
Tue, 30 Mar 2010 08:24:06 +0000 (08:24 +0000)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py

index 9c8989645be31b28ddddfd826d82cf02f7c6d111..4904f70b3a87983195d990e7f64e5601d4911fdf 100644 (file)
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -24,7 +24,7 @@ class UnicodeMethodsTest(unittest.TestCase):
  
      def test_method_checksum(self):
          h = hashlib.sha1()
-        for i in range(65536):
+        for i in range(0x10000):
              char = unichr(i)
              data = [
                  # Predicates (single char)
@@ -282,6 +282,17 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
          self.assertEqual(u"\u01c5".title(), u"\u01c5")
          self.assertEqual(u"\u01c6".title(), u"\u01c5")
  
+    def test_linebreak_7643(self):
+        for i in range(0x10000):
+            lines = (unichr(i) + u'A').splitlines()
+            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
+                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
+                self.assertEqual(len(lines), 2,
+                                 r"\u%.4x should be a linebreak" % i)
+            else:
+                self.assertEqual(len(lines), 1,
+                                 r"\u%.4x should not be a linebreak" % i)
+
  def test_main():
      test.test_support.run_unittest(
          UnicodeMiscTest,
diff --git a/Misc/NEWS b/Misc/NEWS

index 7855dd55e56511f0389638eeb9ee23f2f6cb0d0a..0105bc3491eabe25d543c8479589b097c7160472 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -32,6 +32,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks
+  according to Unicode Standard Annex #14.
+  http://www.unicode.org/reports/tr14/
+
  - Comparisons using one of <, <=, >, >= between a complex instance and
    a Fractions instance now raise TypeError instead of returning
    True/False.  This makes Fraction <=> complex comparisons consistent with
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 4943413cdb2cd5a738642a13fae4355ef20c61ab..930d58c803c7f079c7925bd118aa3e456c651581 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -115,9 +115,9 @@ static char unicode_default_encoding[100];
  /* Fast detection of the most frequent whitespace characters */
  const unsigned char _Py_ascii_whitespace[] = {
      0, 0, 0, 0, 0, 0, 0, 0,
-/*     case 0x0009: * HORIZONTAL TABULATION */
+/*     case 0x0009: * CHARACTER TABULATION */
  /*     case 0x000A: * LINE FEED */
-/*     case 0x000B: * VERTICAL TABULATION */
+/*     case 0x000B: * LINE TABULATION */
  /*     case 0x000C: * FORM FEED */
  /*     case 0x000D: * CARRIAGE RETURN */
      0, 1, 1, 1, 1, 1, 0, 0,
@@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = {
  static unsigned char ascii_linebreak[] = {
      0, 0, 0, 0, 0, 0, 0, 0,
  /*         0x000A, * LINE FEED */
+/*         0x000B, * LINE TABULATION */
+/*         0x000C, * FORM FEED */
  /*         0x000D, * CARRIAGE RETURN */
-    0, 0, 1, 0, 0, 1, 0, 0,
+    0, 0, 1, 1, 1, 1, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0,
  /*         0x001C, * FILE SEPARATOR */
  /*         0x001D, * GROUP SEPARATOR */
diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h

index 443693d3496693b34df527e97a2a927f35575ed0..d2ec46b0f81fd7efb787401bb983f264921165e9 100644 (file)
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -661,7 +661,7 @@ static unsigned char index1[] = {
  };
  
  static unsigned char index2[] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
      1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
@@ -3313,13 +3313,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
  #endif
  }
  
-/* Returns 1 for Unicode characters having the category 'Zl',
- * 'Zp' or type 'B', 0 otherwise.
+/* Returns 1 for Unicode characters having the line break
+ * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
+ * type 'B', 0 otherwise.
   */
  int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
  {
      switch (ch) {
      case 0x000A:
+    case 0x000B:
+    case 0x000C:
      case 0x000D:
      case 0x001C:
      case 0x001D:
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py

index a97d4caf58f0262d0d7f70e99ffdd8f350840c76..3f5ad51969af8ba0136bd4011b8f91cc159d24e1 100644 (file)
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -36,6 +36,7 @@ COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
  EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
  UNIHAN = "Unihan%s.txt"
  DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
  
  old_versions = ["3.2.0"]
  
@@ -50,6 +51,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
  
  EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
  
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
  # note: should match definitions in Objects/unicodectype.c
  ALPHA_MASK = 0x01
  DECIMAL_MASK = 0x02
@@ -71,7 +74,8 @@ def maketables(trace=0):
                            COMPOSITION_EXCLUSIONS % version,
                            EASTASIAN_WIDTH % version,
                            UNIHAN % version,
-                          DERIVEDNORMALIZATION_PROPS % version)
+                          DERIVEDNORMALIZATION_PROPS % version,
+                          LINE_BREAK % version)
  
      print len(filter(None, unicode.table)), "characters"
  
@@ -113,7 +117,7 @@ def makeunicodedata(unicode, trace):
              bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
              mirrored = record[9] == "Y"
              eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
-            normalizationquickcheck = record[16]
+            normalizationquickcheck = record[17]
              item = (
                  category, combining, bidirectional, mirrored, eastasianwidth,
                  normalizationquickcheck
@@ -365,13 +369,14 @@ def makeunicodetype(unicode, trace):
              # extract database properties
              category = record[2]
              bidirectional = record[4]
+            properties = record[16]
              flags = 0
              delta = True
              if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                  flags |= ALPHA_MASK
              if category == "Ll":
                  flags |= LOWER_MASK
-            if category == "Zl" or bidirectional == "B":
+            if 'Line_Break' in properties or bidirectional == "B":
                  flags |= LINEBREAK_MASK
                  linebreaks.append(char)
              if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -524,8 +529,9 @@ def makeunicodetype(unicode, trace):
      print >>fp
  
      # Generate code for _PyUnicode_IsLinebreak()
-    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
-    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+    print >>fp, "/* Returns 1 for Unicode characters having the line break"
+    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
+    print >>fp, " * type 'B', 0 otherwise."
      print >>fp, " */"
      print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
      print >>fp, '{'
@@ -787,6 +793,9 @@ def merge_old_version(version, new, old):
                      elif k == 14:
                          # change to simple titlecase mapping; ignore
                          pass
+                    elif k == 16:
+                        # change to properties; not yet
+                        pass
                      else:
                          class Difference(Exception):pass
                          raise Difference, (hex(i), k, old.table[i], new.table[i])
@@ -803,9 +812,15 @@ def merge_old_version(version, new, old):
  # load a unicode-data file from disk
  
  class UnicodeData:
+    # Record structure:
+    # [ID, name, category, combining, bidi, decomp,  (6)
+    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
+    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
+    #  properties] (17)
  
      def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivednormalizationprops=None, expand=1):
+                 derivednormalizationprops=None, linebreakprops=None,
+                 expand=1):
          self.changed = []
          file = open(filename)
          table = [None] * 0x110000
@@ -868,6 +883,23 @@ class UnicodeData:
          for i in range(0, 0x110000):
              if table[i] is not None:
                  table[i].append(widths[i])
+
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(set())
+        if linebreakprops:
+            for s in open(linebreakprops):
+                s = s.partition('#')[0]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                    continue
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    table[char][-1].add('Line_Break')
+
          if derivednormalizationprops:
              quickchecks = [0] * 0x110000 # default is Yes
              qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
author	Florent Xicluna <florent.xicluna@gmail.com>
	Tue, 30 Mar 2010 08:24:06 +0000 (08:24 +0000)
committer	Florent Xicluna <florent.xicluna@gmail.com>
	Tue, 30 Mar 2010 08:24:06 +0000 (08:24 +0000)
Lib/test/test_unicodedata.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history
Objects/unicodetype_db.h		patch \| blob \| history
Tools/unicode/makeunicodedata.py		patch \| blob \| history