From 8c1c426a631ba02357112657193f82c58d3e08b4 Mon Sep 17 00:00:00 2001
From: Greg Price
Date: Mon, 19 Aug 2019 02:53:22 -0700
Subject: [PATCH] bpo-36502: Correct documentation of str.isspace() (GH-15019) (GH-15296)

The documented definition was much broader than the real one:
there are tons of characters with general category "Other",
and we don't (and shouldn't) treat most of them as whitespace.

Rewrite the definition to agree with the comment on
_PyUnicode_IsWhitespace, and with the logic in makeunicodedata.py,
which is what generates that function and so ultimately governs.

Add suitable breadcrumbs so that a reader who wants to pin down
exactly what this definition means (what's a "bidirectional class"
of "B"?) can do so. The `unicodedata` module documentation is an
appropriate central place for our references to Unicode's own copious
documentation, so point there.

Also add to the isspace() test a thorough check that the
implementation agrees with the intended definition.
---
 Doc/library/stdtypes.rst | 10 +++++++---
 Lib/test/test_unicode.py | 13 ++++++++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
index 965167640c..0f7c369ea5 100644
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1756,9 +1756,13 @@ expression support in the :mod:`re` module).
 .. method:: str.isspace()
 
    Return true if there are only whitespace characters in the string and there is
-   at least one character, false otherwise. Whitespace characters are those
-   characters defined in the Unicode character database as "Other" or "Separator"
-   and those with bidirectional property being one of "WS", "B", or "S".
+   at least one character, false otherwise.
+
+   A character is *whitespace* if in the Unicode character database
+   (see :mod:`unicodedata`), either its general category is ``Zs``
+   ("Separator, space"), or its bidirectional class is one of ``WS``,
+   ``B``, or ``S``.
+
 
 .. method:: str.istitle()
 
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 36b72e40c7..1d6aabdbbc 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -11,6 +11,7 @@ import itertools
 import operator
 import struct
 import sys
+import unicodedata
 import unittest
 import warnings
 from test import support, string_tests
@@ -615,11 +616,21 @@ class UnicodeTest(string_tests.CommonTest,
         self.checkequalnofix(True, '\u2000', 'isspace')
         self.checkequalnofix(True, '\u200a', 'isspace')
         self.checkequalnofix(False, '\u2014', 'isspace')
-        # apparently there are no non-BMP spaces chars in Unicode 6
+        # There are no non-BMP whitespace chars as of Unicode 12.
         for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                    '\U0001F40D', '\U0001F46F']:
             self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
 
+    @support.requires_resource('cpu')
+    def test_isspace_invariant(self):
+        for codepoint in range(sys.maxunicode + 1):
+            char = chr(codepoint)
+            bidirectional = unicodedata.bidirectional(char)
+            category = unicodedata.category(char)
+            self.assertEqual(char.isspace(),
+                             (bidirectional in ('WS', 'B', 'S')
+                              or category == 'Zs'))
+
     def test_isalnum(self):
         super().test_isalnum()
         for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
-- 
2.40.0