bpo-32308: Replace empty matches adjacent to a previous non-empty match in re.sub...

author Serhiy Storchaka <storchaka@gmail.com>

Thu, 4 Jan 2018 09:06:13 +0000 (11:06 +0200)

committer GitHub <noreply@github.com>

Thu, 4 Jan 2018 09:06:13 +0000 (11:06 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Thu, 4 Jan 2018 09:06:13 +0000 (11:06 +0200)
committer GitHub <noreply@github.com>
Thu, 4 Jan 2018 09:06:13 +0000 (11:06 +0200)
diff --git a/Doc/howto/regex.rst b/Doc/howto/regex.rst

index fa8c6939408100aec5d6b4826bbe5e99184df36f..87a6b1aba59f9f2e4d9fc99f7b7915f8dde01239 100644 (file)
--- a/Doc/howto/regex.rst
+++ b/Doc/howto/regex.rst
@@ -1140,12 +1140,12 @@ new string value and the number of replacements  that were performed::
     >>> p.subn('colour', 'no colours at all')
     ('no colours at all', 0)
  
-Empty matches are replaced only when they're not adjacent to a previous match.
+Empty matches are replaced only when they're not adjacent to a previous empty match.
  ::
  
     >>> p = re.compile('x*')
     >>> p.sub('-', 'abxd')
-   '-a-b-d-'
+   '-a-b--d-'
  
  If *replacement* is a string, any backslash escapes in it are processed.  That
  is, ``\n`` is converted to a single newline character, ``\r`` is converted to a
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index dae1d7ea10a031398145c990f380e64ad7b213f1..9b175f4e96756b6851877602aaeb6637964dea5c 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -708,12 +708,15 @@ form.
     That way, separator components are always found at the same relative
     indices within the result list.
  
-   The pattern can match empty strings. ::
+   Empty matches for the pattern split the string only when not adjacent
+   to a previous empty match.
  
        >>> re.split(r'\b', 'Words, words, words.')
        ['', 'Words', ', ', 'words', ', ', 'words', '.']
+      >>> re.split(r'\W*', '...words...')
+      ['', '', 'w', 'o', 'r', 'd', 's', '', '']
        >>> re.split(r'(\W*)', '...words...')
-      ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '']
+      ['', '...', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '', '', '']
  
     .. versionchanged:: 3.1
        Added the optional flags argument.
@@ -778,8 +781,8 @@ form.
     The optional argument *count* is the maximum number of pattern occurrences to be
     replaced; *count* must be a non-negative integer.  If omitted or zero, all
     occurrences will be replaced. Empty matches for the pattern are replaced only
-   when not adjacent to a previous match, so ``sub('x*', '-', 'abc')`` returns
-   ``'-a-b-c-'``.
+   when not adjacent to a previous empty match, so ``sub('x*', '-', 'abxd')`` returns
+   ``'-a-b--d-'``.
  
     In string-type *repl* arguments, in addition to the character escapes and
     backreferences described above,
@@ -805,6 +808,9 @@ form.
        Unknown escapes in *repl* consisting of ``'\'`` and an ASCII letter
        now are errors.
  
+      Empty matches for the pattern are replaced when adjacent to a previous
+      non-empty match.
+
  
  .. function:: subn(pattern, repl, string, count=0, flags=0)
  
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst

index 1924881219a2790ae1373492d045ca23b385043a..1311e9e2016a43ab9793b50d746e58c692f2ba94 100644 (file)
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -881,8 +881,9 @@ Changes in the Python API
  * The result of splitting a string on a :mod:`regular expression <re>`
    that could match an empty string has been changed.  For example
    splitting on ``r'\s*'`` will now split not only on whitespaces as it
-  did previously, but also between any pair of non-whitespace
-  characters.  The previous behavior can be restored by changing the pattern
+  did previously, but also on empty strings before all non-whitespace
+  characters and just before the end of the string.
+  The previous behavior can be restored by changing the pattern
    to ``r'\s+'``.  A :exc:`FutureWarning` was emitted for such patterns since
    Python 3.5.
  
@@ -893,7 +894,13 @@ Changes in the Python API
    positions 2--3.  To match only blank lines, the pattern should be rewritten
    as ``r'(?m)^[^\S\n]*$'``.
  
-  (Contributed by Serhiy Storchaka in :issue:`25054`.)
+  :func:`re.sub()` now replaces empty matches adjacent to a previous
+  non-empty match.  For example ``re.sub('x*', '-', 'abxd')`` returns now
+  ``'-a-b--d-'`` instead of ``'-a-b-d-'`` (the first minus between 'b' and
+  'd' replaces 'x', and the second minus replaces an empty string between
+  'x' and 'd').
+
+  (Contributed by Serhiy Storchaka in :issue:`25054` and :issue:`32308`.)
  
  * :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
    recent to be more consistent with :mod:`traceback`.
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index aaed3d893aaf94d25000ca5a455a6dd881da76c4..9fed4bef8809fcc2029a76842ec10277a4f01266 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -213,11 +213,6 @@ class ReTests(unittest.TestCase):
          self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
                           'hello there')
  
-    def test_bug_462270(self):
-        # Test for empty sub() behaviour, see SF bug #462270
-        self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
-        self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
-
      def test_symbolic_groups(self):
          re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
          re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
@@ -331,10 +326,10 @@ class ReTests(unittest.TestCase):
                           ['', 'a', '', '', 'c'])
  
          for sep, expected in [
-            (':*', ['', 'a', 'b', 'c', '']),
-            ('(?::*)', ['', 'a', 'b', 'c', '']),
-            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']),
-            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']),
+            (':*', ['', '', 'a', '', 'b', '', 'c', '']),
+            ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
+            ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
+            ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
          ]:
              with self.subTest(sep=sep):
                  self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
@@ -357,7 +352,7 @@ class ReTests(unittest.TestCase):
          self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
                           ['', ':', 'a', ':', 'b::c'])
          self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
-                         ['', ':', 'a', ':', 'b::c'])
+                         ['', ':', '', '', 'a:b::c'])
  
      def test_re_findall(self):
          self.assertEqual(re.findall(":+", "abc"), [])
@@ -1753,13 +1748,13 @@ class ReTests(unittest.TestCase):
      def test_zerowidth(self):
          # Issues 852532, 1647489, 3262, 25054.
          self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
-        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
-        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
+        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
+        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
          self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
  
          self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
-        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
-        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
+        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
+        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
  
          self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
          self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
diff --git a/Misc/NEWS.d/next/Library/2017-12-13-20-31-30.bpo-32308.CUbsb2.rst b/Misc/NEWS.d/next/Library/2017-12-13-20-31-30.bpo-32308.CUbsb2.rst

new file mode 100644 (file)

index 0000000..d760d60
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-13-20-31-30.bpo-32308.CUbsb2.rst
@@ -0,0 +1,2 @@
+:func:`re.sub()` now replaces empty matches adjacent to a previous non-empty
+match.
diff --git a/Modules/_sre.c b/Modules/_sre.c

index 68fc523c251b38375cc989efacfc8a85980e38d3..b6be6f6ffa6063c766458297d3f344d4d386ff01 100644 (file)
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -955,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
          }
  
          n = n + 1;
-        state.must_advance = 1;
+        state.must_advance = (state.ptr == state.start);
          last = state.start = state.ptr;
  
      }
@@ -1109,7 +1109,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
  
          i = e;
          n = n + 1;
-        state.must_advance = 1;
+        state.must_advance = (state.ptr == state.start);
          state.start = state.ptr;
      }
author	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 4 Jan 2018 09:06:13 +0000 (11:06 +0200)
committer	GitHub <noreply@github.com>
	Thu, 4 Jan 2018 09:06:13 +0000 (11:06 +0200)
Doc/howto/regex.rst		patch \| blob \| history
Doc/library/re.rst		patch \| blob \| history
Doc/whatsnew/3.7.rst		patch \| blob \| history
Lib/test/test_re.py		patch \| blob \| history
Misc/NEWS.d/next/Library/2017-12-13-20-31-30.bpo-32308.CUbsb2.rst	[new file with mode: 0644]	patch \| blob
Modules/_sre.c		patch \| blob \| history