#10713: Improve documentation for \b and \B and add a few tests. Initial patch and...

author Ezio Melotti <ezio.melotti@gmail.com>

Wed, 29 Feb 2012 09:48:44 +0000 (11:48 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Wed, 29 Feb 2012 09:48:44 +0000 (11:48 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Wed, 29 Feb 2012 09:48:44 +0000 (11:48 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Wed, 29 Feb 2012 09:48:44 +0000 (11:48 +0200)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index b196a28f9d76104e4a1d1b72e3f3576ed70629b4..ac07cf8451c0f89106264a55955f9b8f86a2f334 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -330,16 +330,22 @@ the second character.  For example, ``\$`` matches the character ``'$'``.
     Matches the empty string, but only at the beginning or end of a word.
     A word is defined as a sequence of Unicode alphanumeric or underscore
     characters, so the end of a word is indicated by whitespace or a
-   non-alphanumeric, non-underscore Unicode character. Note that
-   formally, ``\b`` is defined as the boundary between a ``\w`` and a
-   ``\W`` character (or vice versa). By default Unicode alphanumerics
-   are the ones used, but this can be changed by using the :const:`ASCII`
-   flag.  Inside a character range, ``\b`` represents the backspace
-   character, for compatibility with Python's string literals.
+   non-alphanumeric, non-underscore Unicode character.  Note that formally,
+   ``\b`` is defined as the boundary between a ``\w`` and a ``\W`` character
+   (or vice versa), or between ``\w`` and the beginning/end of the string.
+   This means that ``r'\bfoo\b'`` matches ``'foo'``, ``'foo.'``, ``'(foo)'``,
+   ``'bar foo baz'`` but not ``'foobar'`` or ``'foo3'``.
+
+   By default Unicode alphanumerics are the ones used, but this can be changed
+   by using the :const:`ASCII` flag.  Inside a character range, ``\b``
+   represents the backspace character, for compatibility with Python's string
+   literals.
  
  ``\B``
-   Matches the empty string, but only when it is *not* at the beginning or end of a
-   word.  This is just the opposite of ``\b``, so word characters are
+   Matches the empty string, but only when it is *not* at the beginning or end
+   of a word.  This means that ``r'py\B'`` matches ``'python'``, ``'py3'``,
+   ``'py2'``, but not ``'py'``, ``'py.'``, or ``'py!'``.
+   ``\B`` is just the opposite of ``\b``, so word characters are
     Unicode alphanumerics or the underscore, although this can be changed
     by using the :const:`ASCII` flag.
  
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index fe8bc3403947981ac9618fba2f28b2b979b2ab42..0f39eaddde69b6b0e7c609632b4727a7ed1bf615 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -355,6 +355,32 @@ class ReTests(unittest.TestCase):
          self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                     "1aa! a", re.UNICODE).group(0), "1aa! a")
  
+    def test_string_boundaries(self):
+        # See http://bugs.python.org/issue10713
+        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
+                         "abc")
+        # There's a word boundary at the start of a string.
+        self.assertTrue(re.match(r"\b", "abc"))
+        # A non-empty string includes a non-boundary zero-length match.
+        self.assertTrue(re.search(r"\B", "abc"))
+        # There is no non-boundary match at the start of a string.
+        self.assertFalse(re.match(r"\B", "abc"))
+        # However, an empty string contains no word boundaries, and also no
+        # non-boundaries.
+        self.assertEqual(re.search(r"\B", ""), None)
+        # This one is questionable and different from the perlre behavior,
+        # but describes current behavior.
+        self.assertEqual(re.search(r"\b", ""), None)
+        # A single word-character string has two boundaries, but no
+        # non-boundary gaps.
+        self.assertEqual(len(re.findall(r"\b", "a")), 2)
+        self.assertEqual(len(re.findall(r"\B", "a")), 0)
+        # If there are no words, there are no boundaries.
+        self.assertEqual(len(re.findall(r"\b", " ")), 0)
+        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
+        # Can match around the whitespace.
+        self.assertEqual(len(re.findall(r"\B", " ")), 2)
+
      def test_bigcharset(self):
          self.assertEqual(re.match("([\u2222\u2223])",
                                    "\u2222").group(1), "\u2222")
author	Ezio Melotti <ezio.melotti@gmail.com>
	Wed, 29 Feb 2012 09:48:44 +0000 (11:48 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Wed, 29 Feb 2012 09:48:44 +0000 (11:48 +0200)
Doc/library/re.rst		patch \| blob \| history
Lib/test/test_re.py		patch \| blob \| history