Issue #6561: '\d' in a regular expression should match only Unicode decimal digits (category 'Nd')

author Mark Dickinson <dickinsm@gmail.com>

Tue, 28 Jul 2009 17:22:36 +0000 (17:22 +0000)

committer Mark Dickinson <dickinsm@gmail.com>

Tue, 28 Jul 2009 17:22:36 +0000 (17:22 +0000)
author Mark Dickinson <dickinsm@gmail.com>
Tue, 28 Jul 2009 17:22:36 +0000 (17:22 +0000)
committer Mark Dickinson <dickinsm@gmail.com>
Tue, 28 Jul 2009 17:22:36 +0000 (17:22 +0000)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index 3b2f70ba8ab2ba64489e744947c2ed8fa9885bb2..cdb9951dc83cea405010446896a96f9b303f2603 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -338,11 +338,12 @@ the second character.  For example, ``\$`` matches the character ``'$'``.
  
  ``\d``
     For Unicode (str) patterns:
-      Matches any Unicode digit (which includes ``[0-9]``, and also many
-      other digit characters). If the :const:`ASCII` flag is used only
-      ``[0-9]`` is matched (but the flag affects the entire regular
-      expression, so in such cases using an explicit ``[0-9]`` may be a
-      better choice).
+      Matches any Unicode decimal digit (that is, any character in
+      Unicode character category [Nd]).  This includes ``[0-9]``, and
+      also many other digit characters.  If the :const:`ASCII` flag is
+      used only ``[0-9]`` is matched (but the flag affects the entire
+      regular expression, so in such cases using an explicit ``[0-9]``
+      may be a better choice).
     For 8-bit (bytes) patterns:
        Matches any decimal digit; this is equivalent to ``[0-9]``.
  
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index 383b56ac7ab615d56b21c9717f7535be18983a57..8b4d26899303c13442b3d680e0f456784d48e55d 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -605,6 +605,27 @@ class ReTests(unittest.TestCase):
          self.assertEqual(next(iter).span(), (4, 4))
          self.assertRaises(StopIteration, next, iter)
  
+    def test_bug_6561(self):
+        # '\d' should match characters in Unicode category 'Nd'
+        # (Number, Decimal Digit), but not those in 'Nl' (Number,
+        # Letter) or 'No' (Number, Other).
+        decimal_digits = [
+            '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
+            '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
+            '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
+            ]
+        for x in decimal_digits:
+            self.assertEqual(re.match('^\d$', x).group(0), x)
+
+        not_decimal_digits = [
+            '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
+            '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
+            '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
+            '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
+            ]
+        for x in not_decimal_digits:
+            self.assertIsNone(re.match('^\d$', x))
+
      def test_empty_array(self):
          # SF buf 1647541
          import array
diff --git a/Misc/NEWS b/Misc/NEWS

index 83aabb3afccf5d50398dbe1766b550a8d0a69c26..aa4cfd906d02a9138ddc27026514d8a6e645505f 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -108,6 +108,10 @@ Library
  Extension Modules
  -----------------
  
+- Issue #6561: '\d' in a regex now matches only characters with
+  Unicode category 'Nd' (Number, Decimal Digit).  Previously it also
+  matched characters with category 'No'.
+
  - Issue #4509: Array objects are no longer modified after an operation
    failing due to the resize restriction in-place when the object has exported
    buffers.
diff --git a/Modules/_sre.c b/Modules/_sre.c

index 45b92f319d6c5f94ce7ae5bfa1dab45854a313aa..596fd19dfdb573218313f5d8e01706a2055d1c48 100644 (file)
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -168,7 +168,7 @@ static unsigned int sre_lower_locale(unsigned int ch)
  
  #if defined(HAVE_UNICODE)
  
-#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
+#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
  #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
  #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
  #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
author	Mark Dickinson <dickinsm@gmail.com>
	Tue, 28 Jul 2009 17:22:36 +0000 (17:22 +0000)
committer	Mark Dickinson <dickinsm@gmail.com>
	Tue, 28 Jul 2009 17:22:36 +0000 (17:22 +0000)
Doc/library/re.rst		patch | blob | history
Lib/test/test_re.py		patch | blob | history
Misc/NEWS		patch | blob | history
Modules/_sre.c		patch | blob | history