From d3faf43f9ba7da0ae504c9186b10d0fa3a8eb300 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 18 Jan 2015 11:28:37 +0200 Subject: [PATCH] Issue #23181: More "codepoint" -> "code point". --- Doc/c-api/unicode.rst | 2 +- Doc/library/codecs.rst | 12 ++++++------ Doc/library/email.mime.rst | 2 +- Doc/library/functions.rst | 2 +- Doc/library/html.entities.rst | 4 ++-- Doc/library/json.rst | 2 +- Doc/tutorial/datastructures.rst | 2 +- Doc/whatsnew/3.3.rst | 12 ++++++------ Lib/codecs.py | 2 +- Lib/email/message.py | 2 +- Lib/html/entities.py | 4 ++-- Lib/test/multibytecodec_support.py | 2 +- Lib/test/test_html.py | 4 ++-- Lib/test/test_multibytecodec.py | 2 +- Lib/test/test_stringprep.py | 2 +- Lib/test/test_unicode.py | 4 ++-- Modules/cjkcodecs/_codecs_cn.c | 2 +- Modules/cjkcodecs/_codecs_hk.c | 2 +- Modules/cjkcodecs/_codecs_kr.c | 2 +- Modules/cjkcodecs/cjkcodecs.h | 4 ++-- Modules/unicodedata.c | 14 +++++++------- Objects/unicodeobject.c | 4 ++-- Python/sysmodule.c | 2 +- Tools/unicode/gencodec.py | 2 +- 24 files changed, 46 insertions(+), 46 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index f541b1c8de..03c284ad39 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1134,7 +1134,7 @@ These are the UTF-32 codec APIs: mark (U+FEFF). In the other two modes, no BOM mark is prepended. If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output - as a single codepoint. + as a single code point. Return *NULL* if an exception was raised by the codec. diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 8be5d18dc1..dd44cb27cc 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -827,7 +827,7 @@ methods and attributes from the underlying stream. Encodings and Unicode --------------------- -Strings are stored internally as sequences of codepoints in +Strings are stored internally as sequences of code points in range ``0x0``-``0x10FFFF``. (See :pep:`393` for more details about the implementation.) Once a string object is used outside of CPU and memory, endianness @@ -838,23 +838,23 @@ There are a variety of different text serialisation codecs, which are collectivity referred to as :term:`text encodings `. The simplest text encoding (called ``'latin-1'`` or ``'iso-8859-1'``) maps -the codepoints 0-255 to the bytes ``0x0``-``0xff``, which means that a string -object that contains codepoints above ``U+00FF`` can't be encoded with this +the code points 0-255 to the bytes ``0x0``-``0xff``, which means that a string +object that contains code points above ``U+00FF`` can't be encoded with this codec. Doing so will raise a :exc:`UnicodeEncodeError` that looks like the following (although the details of the error message may differ): ``UnicodeEncodeError: 'latin-1' codec can't encode character '\u1234' in position 3: ordinal not in range(256)``. There's another group of encodings (the so called charmap encodings) that choose -a different subset of all Unicode code points and how these codepoints are +a different subset of all Unicode code points and how these code points are mapped to the bytes ``0x0``-``0xff``. To see how this is done simply open e.g. :file:`encodings/cp1252.py` (which is an encoding that is used primarily on Windows). There's a string constant with 256 characters that shows you which character is mapped to which byte value. -All of these encodings can only encode 256 of the 1114112 codepoints +All of these encodings can only encode 256 of the 1114112 code points defined in Unicode. A simple and straightforward way that can store each Unicode -code point, is to store each codepoint as four consecutive bytes. There are two +code point, is to store each code point as four consecutive bytes. There are two possibilities: store the bytes in big endian or in little endian order. These two encodings are called ``UTF-32-BE`` and ``UTF-32-LE`` respectively. Their disadvantage is that if e.g. you use ``UTF-32-BE`` on a little endian machine you diff --git a/Doc/library/email.mime.rst b/Doc/library/email.mime.rst index 4cdb322f46..1d70225fc8 100644 --- a/Doc/library/email.mime.rst +++ b/Doc/library/email.mime.rst @@ -194,7 +194,7 @@ Here are the classes: minor type and defaults to :mimetype:`plain`. *_charset* is the character set of the text and is passed as an argument to the :class:`~email.mime.nonmultipart.MIMENonMultipart` constructor; it defaults - to ``us-ascii`` if the string contains only ``ascii`` codepoints, and + to ``us-ascii`` if the string contains only ``ascii`` code points, and ``utf-8`` otherwise. Unless the *_charset* argument is explicitly set to ``None``, the diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst index dad247d3ac..d9e5cfb5fc 100644 --- a/Doc/library/functions.rst +++ b/Doc/library/functions.rst @@ -156,7 +156,7 @@ are always available. They are listed here in alphabetical order. .. function:: chr(i) - Return the string representing a character whose Unicode codepoint is the integer + Return the string representing a character whose Unicode code point is the integer *i*. For example, ``chr(97)`` returns the string ``'a'``. This is the inverse of :func:`ord`. The valid range for the argument is from 0 through 1,114,111 (0x10FFFF in base 16). :exc:`ValueError` will be raised if *i* is diff --git a/Doc/library/html.entities.rst b/Doc/library/html.entities.rst index 09b0abc837..e10e46e2b8 100644 --- a/Doc/library/html.entities.rst +++ b/Doc/library/html.entities.rst @@ -33,12 +33,12 @@ This module defines four dictionaries, :data:`html5`, .. data:: name2codepoint - A dictionary that maps HTML entity names to the Unicode codepoints. + A dictionary that maps HTML entity names to the Unicode code points. .. data:: codepoint2name - A dictionary that maps Unicode codepoints to HTML entity names. + A dictionary that maps Unicode code points to HTML entity names. .. rubric:: Footnotes diff --git a/Doc/library/json.rst b/Doc/library/json.rst index 33ad102e6f..6f5f8b15a8 100644 --- a/Doc/library/json.rst +++ b/Doc/library/json.rst @@ -512,7 +512,7 @@ The RFC does not explicitly forbid JSON strings which contain byte sequences that don't correspond to valid Unicode characters (e.g. unpaired UTF-16 surrogates), but it does note that they may cause interoperability problems. By default, this module accepts and outputs (when present in the original -:class:`str`) codepoints for such sequences. +:class:`str`) code points for such sequences. Infinite and NaN Number Values diff --git a/Doc/tutorial/datastructures.rst b/Doc/tutorial/datastructures.rst index 5c3ae16980..1ea299fbed 100644 --- a/Doc/tutorial/datastructures.rst +++ b/Doc/tutorial/datastructures.rst @@ -684,7 +684,7 @@ the same type, the lexicographical comparison is carried out recursively. If all items of two sequences compare equal, the sequences are considered equal. If one sequence is an initial sub-sequence of the other, the shorter sequence is the smaller (lesser) one. Lexicographical ordering for strings uses the Unicode -codepoint number to order individual characters. Some examples of comparisons +code point number to order individual characters. Some examples of comparisons between sequences of the same type:: (1, 2, 3) < (1, 2, 4) diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index 6a2c42555f..1fdb365847 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -228,7 +228,7 @@ Functionality Changes introduced by :pep:`393` are the following: -* Python now always supports the full range of Unicode codepoints, including +* Python now always supports the full range of Unicode code points, including non-BMP ones (i.e. from ``U+0000`` to ``U+10FFFF``). The distinction between narrow and wide builds no longer exists and Python now behaves like a wide build, even under Windows. @@ -246,7 +246,7 @@ Changes introduced by :pep:`393` are the following: so ``'\U0010FFFF'[0]`` now returns ``'\U0010FFFF'`` and not ``'\uDBFF'``; * all other functions in the standard library now correctly handle - non-BMP codepoints. + non-BMP code points. * The value of :data:`sys.maxunicode` is now always ``1114111`` (``0x10FFFF`` in hexadecimal). The :c:func:`PyUnicode_GetMax` function still returns @@ -258,13 +258,13 @@ Changes introduced by :pep:`393` are the following: Performance and resource usage ------------------------------ -The storage of Unicode strings now depends on the highest codepoint in the string: +The storage of Unicode strings now depends on the highest code point in the string: -* pure ASCII and Latin1 strings (``U+0000-U+00FF``) use 1 byte per codepoint; +* pure ASCII and Latin1 strings (``U+0000-U+00FF``) use 1 byte per code point; -* BMP strings (``U+0000-U+FFFF``) use 2 bytes per codepoint; +* BMP strings (``U+0000-U+FFFF``) use 2 bytes per code point; -* non-BMP strings (``U+10000-U+10FFFF``) use 4 bytes per codepoint. +* non-BMP strings (``U+10000-U+10FFFF``) use 4 bytes per code point. The net effect is that for most applications, memory usage of string storage should decrease significantly - especially compared to former diff --git a/Lib/codecs.py b/Lib/codecs.py index bca3ef3934..145bf120f7 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -123,7 +123,7 @@ class Codec: Python will use the official U+FFFD REPLACEMENT CHARACTER for the builtin Unicode codecs on decoding and '?' on encoding. - 'surrogateescape' - replace with private codepoints U+DCnn. + 'surrogateescape' - replace with private code points U+DCnn. 'xmlcharrefreplace' - Replace with the appropriate XML character reference (only for encoding). 'backslashreplace' - Replace with backslashed escape sequences diff --git a/Lib/email/message.py b/Lib/email/message.py index a179f8e374..2f37dbb892 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -273,7 +273,7 @@ class Message: bpayload = payload.encode('ascii') except UnicodeError: # This won't happen for RFC compliant messages (messages - # containing only ASCII codepoints in the unicode input). + # containing only ASCII code points in the unicode input). # If it does happen, turn the string into bytes in a way # guaranteed not to fail. bpayload = payload.encode('raw-unicode-escape') diff --git a/Lib/html/entities.py b/Lib/html/entities.py index e891ad6599..f7deae6eb0 100644 --- a/Lib/html/entities.py +++ b/Lib/html/entities.py @@ -1,6 +1,6 @@ """HTML character entity references.""" -# maps the HTML entity name to the Unicode codepoint +# maps the HTML entity name to the Unicode code point name2codepoint = { 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 @@ -2492,7 +2492,7 @@ html5 = { 'zwnj;': '\u200c', } -# maps the Unicode codepoint to the HTML entity name +# maps the Unicode code point to the HTML entity name codepoint2name = {} # maps the HTML entity name to the character diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py index 51f5b541b3..bc1cfc8518 100644 --- a/Lib/test/multibytecodec_support.py +++ b/Lib/test/multibytecodec_support.py @@ -21,7 +21,7 @@ class TestBase: roundtriptest = 1 # set if roundtrip is possible with unicode has_iso10646 = 0 # set if this encoding contains whole iso10646 map xmlcharnametest = None # string to test xmlcharrefreplace - unmappedunicode = '\udeee' # a unicode codepoint that is not mapped. + unmappedunicode = '\udeee' # a unicode code point that is not mapped. def setUp(self): if self.codec is None: diff --git a/Lib/test/test_html.py b/Lib/test/test_html.py index 5e9f382a2d..d6f0ae857d 100644 --- a/Lib/test/test_html.py +++ b/Lib/test/test_html.py @@ -48,10 +48,10 @@ class HtmlTests(unittest.TestCase): check(s % num, char) for end in [' ', 'X']: check((s+end) % num, char+end) - # check invalid codepoints + # check invalid code points for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]: check_num(cp, '\uFFFD') - # check more invalid codepoints + # check more invalid code points for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]: check_num(cp, '') # check invalid numbers diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py index ea592c7501..ce267ddeb3 100644 --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -80,7 +80,7 @@ class Test_IncrementalEncoder(unittest.TestCase): self.assertEqual(encoder.reset(), None) def test_stateful(self): - # jisx0213 encoder is stateful for a few codepoints. eg) + # jisx0213 encoder is stateful for a few code points. eg) # U+00E6 => A9DC # U+00E6 U+0300 => ABC4 # U+0300 => ABDC diff --git a/Lib/test/test_stringprep.py b/Lib/test/test_stringprep.py index aa7122172f..e763635efe 100644 --- a/Lib/test/test_stringprep.py +++ b/Lib/test/test_stringprep.py @@ -1,5 +1,5 @@ # To fully test this module, we would need a copy of the stringprep tables. -# Since we don't have them, this test checks only a few codepoints. +# Since we don't have them, this test checks only a few code points. import unittest from test import support diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 634bf93ee6..7735a6bbaa 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1441,9 +1441,9 @@ class UnicodeTest(string_tests.CommonTest, def test_utf8_decode_invalid_sequences(self): # continuation bytes in a sequence of 2, 3, or 4 bytes continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] - # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F + # start bytes of a 2-byte sequence equivalent to code points < 0x7F invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] - # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF + # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] invalid_start_bytes = ( continuation_bytes + invalid_2B_seq_start_bytes + diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index 013c3fb6b7..1a070f2f39 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -15,7 +15,7 @@ #undef hz #endif -/* GBK and GB2312 map differently in few codepoints that are listed below: +/* GBK and GB2312 map differently in few code points that are listed below: * * gb2312 gbk * A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c index b7a7ebd6e1..4f21569a0c 100644 --- a/Modules/cjkcodecs/_codecs_hk.c +++ b/Modules/cjkcodecs/_codecs_hk.c @@ -171,7 +171,7 @@ DECODER(big5hkscs) default: return 1; } - NEXT_IN(2); /* all decoded codepoints are pairs, above. */ + NEXT_IN(2); /* all decoded code points are pairs, above. */ } return 0; diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c index 1ad41a7851..6d6acb5c4b 100644 --- a/Modules/cjkcodecs/_codecs_kr.c +++ b/Modules/cjkcodecs/_codecs_kr.c @@ -69,7 +69,7 @@ ENCODER(euc_kr) OUTBYTE1(EUCKR_JAMO_FIRSTBYTE); OUTBYTE2(EUCKR_JAMO_FILLER); - /* All codepoints in CP949 extension are in unicode + /* All code points in CP949 extension are in unicode * Hangul Syllable area. */ assert(0xac00 <= c && c <= 0xd7a3); c -= 0xac00; diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h index 25bab41cf3..d15ccfbb07 100644 --- a/Modules/cjkcodecs/cjkcodecs.h +++ b/Modules/cjkcodecs/cjkcodecs.h @@ -12,10 +12,10 @@ #include "multibytecodec.h" -/* a unicode "undefined" codepoint */ +/* a unicode "undefined" code point */ #define UNIINV 0xFFFE -/* internal-use DBCS codepoints which aren't used by any charsets */ +/* internal-use DBCS code points which aren't used by any charsets */ #define NOCHAR 0xFFFF #define MULTIC 0xFFFE #define DBCINV 0xFFFD diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 3979f65738..ec70e7af9d 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -976,7 +976,7 @@ is_unified_ideograph(Py_UCS4 code) (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */ } -/* macros used to determine if the given codepoint is in the PUA range that +/* macros used to determine if the given code point is in the PUA range that * we are using to store aliases and named sequences */ #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end)) #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \ @@ -986,7 +986,7 @@ static int _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq) { - /* Find the name associated with the given codepoint. + /* Find the name associated with the given code point. * If with_alias_and_seq is 1, check for names in the Private Use Area 15 * that we are using for aliases and named sequences. */ int offset; @@ -997,7 +997,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, if (code >= 0x110000) return 0; - /* XXX should we just skip all the codepoints in the PUAs here? */ + /* XXX should we just skip all the code points in the PUAs here? */ if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) return 0; @@ -1125,8 +1125,8 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) /* check if named sequences are allowed */ if (!with_named_seq && IS_NAMED_SEQ(cp)) return 0; - /* if the codepoint is in the PUA range that we use for aliases, - * convert it to obtain the right codepoint */ + /* if the code point is in the PUA range that we use for aliases, + * convert it to obtain the right code point */ if (IS_ALIAS(cp)) *code = name_aliases[cp-aliases_start]; else @@ -1138,9 +1138,9 @@ static int _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, int with_named_seq) { - /* Return the codepoint associated with the given name. + /* Return the code point associated with the given name. * Named aliases are resolved too (unless self != NULL (i.e. we are using - * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are + * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are * using for the named sequence, and the caller must then convert it. */ unsigned int h, v; unsigned int mask = code_size-1; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8a13f2cdd7..216cd6a3e5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5048,7 +5048,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, } if (Py_UNICODE_IS_SURROGATE(ch)) { - errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)"; + errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; startinpos = ((const char *)q) - starts; endinpos = startinpos + 4; } @@ -5067,7 +5067,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, q += 4; continue; } - errmsg = "codepoint not in range(0x110000)"; + errmsg = "code point not in range(0x110000)"; startinpos = ((const char *)q) - starts; endinpos = startinpos + 4; } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 106fc84fa9..290eec1199 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -1368,7 +1368,7 @@ hexversion -- version information encoded as a single integer\n\ implementation -- Python implementation information.\n\ int_info -- a struct sequence with information about the int implementation.\n\ maxsize -- the largest supported length of containers.\n\ -maxunicode -- the value of the largest Unicode codepoint\n\ +maxunicode -- the value of the largest Unicode code point\n\ platform -- platform identifier\n\ prefix -- prefix used to find the Python library\n\ thread_info -- a struct sequence with information about the thread implementation.\n\ diff --git a/Tools/unicode/gencodec.py b/Tools/unicode/gencodec.py index f4c7c038c4..98b39758f9 100644 --- a/Tools/unicode/gencodec.py +++ b/Tools/unicode/gencodec.py @@ -34,7 +34,7 @@ MAX_TABLE_SIZE = 8192 # Standard undefined Unicode code point UNI_UNDEFINED = chr(0xFFFE) -# Placeholder for a missing codepoint +# Placeholder for a missing code point MISSING_CODE = -1 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' -- 2.40.0