That way, separator components are always found at the same relative
indices within the result list.
- The pattern can match empty strings. ::
+ Empty matches for the pattern split the string only when not adjacent
+ to a previous empty match.
>>> re.split(r'\b', 'Words, words, words.')
['', 'Words', ', ', 'words', ', ', 'words', '.']
+ >>> re.split(r'\W*', '...words...')
+ ['', '', 'w', 'o', 'r', 'd', 's', '', '']
>>> re.split(r'(\W*)', '...words...')
- ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '']
+ ['', '...', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '', '', '']
.. versionchanged:: 3.1
Added the optional flags argument.
The optional argument *count* is the maximum number of pattern occurrences to be
replaced; *count* must be a non-negative integer. If omitted or zero, all
occurrences will be replaced. Empty matches for the pattern are replaced only
- when not adjacent to a previous match, so ``sub('x*', '-', 'abc')`` returns
- ``'-a-b-c-'``.
+ when not adjacent to a previous empty match, so ``sub('x*', '-', 'abxd')`` returns
+ ``'-a-b--d-'``.
In string-type *repl* arguments, in addition to the character escapes and
backreferences described above,
Unknown escapes in *repl* consisting of ``'\'`` and an ASCII letter
now are errors.
+ Empty matches for the pattern are replaced when adjacent to a previous
+ non-empty match.
+
.. function:: subn(pattern, repl, string, count=0, flags=0)
* The result of splitting a string on a :mod:`regular expression <re>`
that could match an empty string has been changed. For example
splitting on ``r'\s*'`` will now split not only on whitespaces as it
- did previously, but also between any pair of non-whitespace
- characters. The previous behavior can be restored by changing the pattern
+ did previously, but also on empty strings before all non-whitespace
+ characters and just before the end of the string.
+ The previous behavior can be restored by changing the pattern
to ``r'\s+'``. A :exc:`FutureWarning` was emitted for such patterns since
Python 3.5.
positions 2--3. To match only blank lines, the pattern should be rewritten
as ``r'(?m)^[^\S\n]*$'``.
- (Contributed by Serhiy Storchaka in :issue:`25054`.)
+ :func:`re.sub()` now replaces empty matches adjacent to a previous
+ non-empty match. For example ``re.sub('x*', '-', 'abxd')`` now returns
+ ``'-a-b--d-'`` instead of ``'-a-b-d-'`` (the first minus between 'b' and
+ 'd' replaces 'x', and the second minus replaces an empty string between
+ 'x' and 'd').
+
+ (Contributed by Serhiy Storchaka in :issue:`25054` and :issue:`32308`.)
* :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
recent to be more consistent with :mod:`traceback`.
self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
'hello there')
- def test_bug_462270(self):
- # Test for empty sub() behaviour, see SF bug #462270
- self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
- self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
-
def test_symbolic_groups(self):
re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
['', 'a', '', '', 'c'])
for sep, expected in [
- (':*', ['', 'a', 'b', 'c', '']),
- ('(?::*)', ['', 'a', 'b', 'c', '']),
- ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']),
- ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']),
+ (':*', ['', '', 'a', '', 'b', '', 'c', '']),
+ ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
+ ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
+ ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
]:
with self.subTest(sep=sep):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
- ['', ':', 'a', ':', 'b::c'])
+ ['', ':', '', '', 'a:b::c'])
def test_re_findall(self):
self.assertEqual(re.findall(":+", "abc"), [])
def test_zerowidth(self):
# Issues 852532, 1647489, 3262, 25054.
self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
- self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
- self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
+ self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
+ self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
- self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
- self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
+ self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
+ self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
self.assertEqual(re.findall(r"\b|\w+", "a::bc"),