bpo-28180: Implementation for PEP 538 (#659)

author Nick Coghlan <ncoghlan@gmail.com>

Sun, 11 Jun 2017 03:16:15 +0000 (13:16 +1000)

committer GitHub <noreply@github.com>

Sun, 11 Jun 2017 03:16:15 +0000 (13:16 +1000)
author Nick Coghlan <ncoghlan@gmail.com>
Sun, 11 Jun 2017 03:16:15 +0000 (13:16 +1000)
committer GitHub <noreply@github.com>
Sun, 11 Jun 2017 03:16:15 +0000 (13:16 +1000)
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst

index 40a06b9adc06ef165badce57912e1b27261eac33..920d5c01e4bef44456165c833aa3f5492dac7287 100644 (file)
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -713,6 +713,42 @@ conflict.
  
     .. versionadded:: 3.6
  
+
+.. envvar:: PYTHONCOERCECLOCALE
+
+   If set to the value ``0``, causes the main Python command line application
+   to skip coercing the legacy ASCII-based C locale to a more capable UTF-8
+   based alternative. Note that this setting is checked even when the
+   :option:`-E` or :option:`-I` options are used, as it is handled prior to
+   the processing of command line options.
+
+   If this variable is *not* set, or is set to a value other than ``0``, and
+   the current locale reported for the ``LC_CTYPE`` category is the default
+   ``C`` locale, then the Python CLI will attempt to configure the following
+   locales for the ``LC_CTYPE`` category in the order listed before loading the
+   interpreter runtime:
+
+   * ``C.UTF-8``
+   * ``C.utf8``
+   * ``UTF-8``
+
+   If setting one of these locales succeeds, then the ``LC_CTYPE``
+   environment variable will also be set accordingly in the current process
+   environment before the Python runtime is initialized. This ensures the
+   updated setting is seen in subprocesses, as well as in operations that
+   query the environment rather than the current C locale (such as Python's
+   own :func:`locale.getdefaultlocale`).
+
+   Configuring one of these locales (either explicitly or via the above
+   implicit locale coercion) will automatically set the error handler for
+   :data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This
+   behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual.
+
+   Availability: \*nix
+
+   .. versionadded:: 3.7
+      See :pep:`538` for more details.
+
  Debug-mode variables
  ~~~~~~~~~~~~~~~~~~~~
  
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst

index 3df30935cae6d55c2a3e8f7f26961e1dd265cedf..6074781010399fcd626db2f311c4e53a7a8930e0 100644 (file)
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -70,6 +70,51 @@ Summary -- Release highlights
  New Features
  ============
  
+.. _whatsnew37-pep538:
+
+PEP 538: Legacy C Locale Coercion
+---------------------------------
+
+An ongoing challenge within the Python 3 series has been determining a sensible
+default strategy for handling the "7-bit ASCII" text encoding assumption
+currently implied by the use of the default C locale on non-Windows platforms.
+
+:pep:`538` updates the default interpreter command line interface to
+automatically coerce that locale to an available UTF-8 based locale as
+described in the documentation of the new :envvar:`PYTHONCOERCECLOCALE`
+environment variable. Automatically setting ``LC_CTYPE`` this way means that
+both the core interpreter and locale-aware C extensions (such as
+:mod:`readline`) will assume the use of UTF-8 as the default text encoding,
+rather than ASCII.
+
+The platform support definition in :pep:`11` has also been updated to limit
+full text handling support to suitably configured non-ASCII based locales.
+
+As part of this change, the default error handler for ``stdin`` and ``stdout``
+is now ``surrogateescape`` (rather than ``strict``) when using any of the
+defined coercion target locales (currently ``C.UTF-8``, ``C.utf8``, and
+``UTF-8``). The default error handler for ``stderr`` continues to be
+``backslashreplace``, regardless of locale.
+
+.. note::
+
+   In the current implementation, a warning message is printed directly to
+   ``stderr`` even for successful implicit locale coercion. This gives
+   redistributors and system integrators the opportunity to determine if they
+   should be making an environmental change to avoid the need for implicit
+   coercion at the Python interpreter level.
+
+   However, it's not clear that this is going to be the best approach for
+   the final 3.7.0 release, and we may end up deciding to disable the warning
+   by default and provide some way of opting into it at runtime or build time.
+
+   Concrete examples of use cases where it would be preferable to disable the
+   warning by default can be noted on :issue:`30565`.
+
+.. seealso::
+
+    :pep:`538` -- Coercing the legacy C locale to a UTF-8 based locale
+       PEP written and implemented by Nick Coghlan.
  
  
  Other Language Changes
diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py

index 1e746472ee63489b382be84578f2e1e1ce89ebec..b3ac848f08252b83ca3738b12aebc0a42fe142b1 100644 (file)
--- a/Lib/test/support/script_helper.py
+++ b/Lib/test/support/script_helper.py
@@ -48,8 +48,35 @@ def interpreter_requires_environment():
      return __cached_interp_requires_environment
  
  
-_PythonRunResult = collections.namedtuple("_PythonRunResult",
-                                          ("rc", "out", "err"))
+class _PythonRunResult(collections.namedtuple("_PythonRunResult",
+                                          ("rc", "out", "err"))):
+    """Helper for reporting Python subprocess run results"""
+    def fail(self, cmd_line):
+        """Provide helpful details about failed subcommand runs"""
+        # Keep only the last 80 * 100 bytes of each stream, decoded as ASCII
+        maxlen = 80 * 100
+        out, err = self.out, self.err
+        if len(out) > maxlen:
+            out = b'(... truncated stdout ...)' + out[-maxlen:]
+        if len(err) > maxlen:
+            err = b'(... truncated stderr ...)' + err[-maxlen:]
+        out = out.decode('ascii', 'replace').rstrip()
+        err = err.decode('ascii', 'replace').rstrip()
+        raise AssertionError("Process return code is %d\n"
+                             "command line: %r\n"
+                             "\n"
+                             "stdout:\n"
+                             "---\n"
+                             "%s\n"
+                             "---\n"
+                             "\n"
+                             "stderr:\n"
+                             "---\n"
+                             "%s\n"
+                             "---"
+                             % (self.rc, cmd_line,
+                                out,
+                                err))
  
  
  # Executing the interpreter in a subprocess
@@ -107,30 +134,7 @@ def run_python_until_end(*args, **env_vars):
  def _assert_python(expected_success, *args, **env_vars):
      res, cmd_line = run_python_until_end(*args, **env_vars)
      if (res.rc and expected_success) or (not res.rc and not expected_success):
-        # Limit to 80 lines to ASCII characters
-        maxlen = 80 * 100
-        out, err = res.out, res.err
-        if len(out) > maxlen:
-            out = b'(... truncated stdout ...)' + out[-maxlen:]
-        if len(err) > maxlen:
-            err = b'(... truncated stderr ...)' + err[-maxlen:]
-        out = out.decode('ascii', 'replace').rstrip()
-        err = err.decode('ascii', 'replace').rstrip()
-        raise AssertionError("Process return code is %d\n"
-                             "command line: %r\n"
-                             "\n"
-                             "stdout:\n"
-                             "---\n"
-                             "%s\n"
-                             "---\n"
-                             "\n"
-                             "stderr:\n"
-                             "---\n"
-                             "%s\n"
-                             "---"
-                             % (res.rc, cmd_line,
-                                out,
-                                err))
+        res.fail(cmd_line)
      return res
  
  def assert_python_ok(*args, **env_vars):
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py

new file mode 100644 (file)

index 0000000..c14d820
--- /dev/null
+++ b/Lib/test/test_c_locale_coercion.py
@@ -0,0 +1,262 @@
+# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
+
+import unittest
+import os
+import sys
+import sysconfig
+import shutil
+import subprocess
+from collections import namedtuple
+
+import test.support
+from test.support.script_helper import (
+    run_python_until_end,
+    interpreter_requires_environment,
+)
+
+# In order to get the warning messages to match up as expected, the candidate
+# order here must match the target locale order in Python/pylifecycle.c
+_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
+
+# There's no reliable cross-platform way of checking locale alias
+# lists, so the only way of knowing which of these locales will work
+# is to try them with locale.setlocale(). We do that in a subprocess
+# to avoid altering the locale of the test runner.
+def _set_locale_in_subprocess(locale_name):
+    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
+    cmd = cmd_fmt.format(locale_name)
+    result, py_cmd = run_python_until_end("-c", cmd, __isolated=True)
+    return result.rc == 0
+
+_EncodingDetails = namedtuple("EncodingDetails",
+                              "fsencoding stdin_info stdout_info stderr_info")
+
+class EncodingDetails(_EncodingDetails):
+    CHILD_PROCESS_SCRIPT = ";".join([
+        "import sys",
+        "print(sys.getfilesystemencoding())",
+        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
+        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
+        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
+    ])
+
+    @classmethod
+    def get_expected_details(cls, expected_fsencoding):
+        """Returns expected child process details for a given encoding"""
+        _stream = expected_fsencoding + ":{}"
+        # stdin and stdout should use surrogateescape either because the
+        # coercion triggered, or because the C locale was detected
+        stream_info = 2*[_stream.format("surrogateescape")]
+        # stderr should always use backslashreplace
+        stream_info.append(_stream.format("backslashreplace"))
+        return dict(cls(expected_fsencoding, *stream_info)._asdict())
+
+    @staticmethod
+    def _handle_output_variations(data):
+        """Adjust the output to handle platform specific idiosyncrasies
+
+        * Some platforms report ASCII as ANSI_X3.4-1968
+        * Some platforms report ASCII as US-ASCII
+        * Some platforms report UTF-8 instead of utf-8
+        """
+        data = data.replace(b"ANSI_X3.4-1968", b"ascii")
+        data = data.replace(b"US-ASCII", b"ascii")
+        data = data.lower()
+        return data
+
+    @classmethod
+    def get_child_details(cls, env_vars):
+        """Retrieves fsencoding and standard stream details from a child process
+
+        Returns (encoding_details, stderr_lines):
+
+        - encoding_details: dict of EncodingDetails fields from the child
+        - stderr_lines: result of calling splitlines() on the stderr output
+
+        The child is run in isolated mode if the current interpreter supports
+        that.
+        """
+        result, py_cmd = run_python_until_end(
+            "-c", cls.CHILD_PROCESS_SCRIPT,
+            __isolated=True,
+            **env_vars
+        )
+        if not result.rc == 0:
+            result.fail(py_cmd)
+        # All subprocess outputs in this test case should be pure ASCII
+        adjusted_output = cls._handle_output_variations(result.out)
+        stdout_lines = adjusted_output.decode("ascii").rstrip().splitlines()
+        child_encoding_details = dict(cls(*stdout_lines)._asdict())
+        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
+        return child_encoding_details, stderr_lines
+
+
+class _ChildProcessEncodingTestCase(unittest.TestCase):
+    # Base class to check for expected encoding details in a child process
+
+    def _check_child_encoding_details(self,
+                                      env_vars,
+                                      expected_fsencoding,
+                                      expected_warning):
+        """Check the C locale handling for the given process environment
+
+        Parameters:
+            env_vars: environment variables to set in the child process
+            expected_fsencoding: the encoding the child is expected to report
+            expected_warning: the list of lines the child is expected to
+              emit on stderr
+        """
+        result = EncodingDetails.get_child_details(env_vars)
+        encoding_details, stderr_lines = result
+        self.assertEqual(encoding_details,
+                         EncodingDetails.get_expected_details(
+                             expected_fsencoding))
+        self.assertEqual(stderr_lines, expected_warning)
+
+# Details of the shared library warning emitted at runtime
+LIBRARY_C_LOCALE_WARNING = (
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
+    "locales is recommended."
+)
+
+@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"),
+                     "C locale runtime warning disabled at build time")
+class LocaleWarningTests(_ChildProcessEncodingTestCase):
+    # Test warning emitted when running in the C locale
+
+    def test_library_c_locale_warning(self):
+        self.maxDiff = None
+        for locale_to_set in ("C", "POSIX", "invalid.ascii"):
+            var_dict = {
+                "LC_ALL": locale_to_set
+            }
+            with self.subTest(forced_locale=locale_to_set):
+                self._check_child_encoding_details(var_dict,
+                                                   "ascii",
+                                                   [LIBRARY_C_LOCALE_WARNING])
+
+# Details of the CLI locale coercion warning emitted at runtime
+CLI_COERCION_WARNING_FMT = (
+    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
+    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
+)
+
+class _LocaleCoercionTargetsTestCase(_ChildProcessEncodingTestCase):
+    # Base class for test cases that rely on coercion targets being defined
+
+    available_targets = []
+    targets_required = True
+
+    @classmethod
+    def setUpClass(cls):
+        first_target_locale = None
+        available_targets = cls.available_targets
+        # Find the target locales available in the current system
+        for target_locale in _C_UTF8_LOCALES:
+            if _set_locale_in_subprocess(target_locale):
+                available_targets.append(target_locale)
+                if first_target_locale is None:
+                    first_target_locale = target_locale
+        if cls.targets_required and not available_targets:
+            raise unittest.SkipTest("No C-with-UTF-8 locale available")
+        # Expect coercion to use the first available locale
+        warning_msg = CLI_COERCION_WARNING_FMT.format(first_target_locale)
+        cls.EXPECTED_COERCION_WARNING = warning_msg
+
+
+class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase):
+    # Test explicit external configuration via the process environment
+
+    def test_external_target_locale_configuration(self):
+        # Explicitly setting a target locale should give the same behaviour as
+        # is seen when implicitly coercing to that target locale
+        self.maxDiff = None
+
+        expected_warning = []
+        expected_fsencoding = "utf-8"
+
+        base_var_dict = {
+            "LANG": "",
+            "LC_CTYPE": "",
+            "LC_ALL": "",
+        }
+        for env_var in ("LANG", "LC_CTYPE"):
+            for locale_to_set in self.available_targets:
+                with self.subTest(env_var=env_var,
+                                  configured_locale=locale_to_set):
+                    var_dict = base_var_dict.copy()
+                    var_dict[env_var] = locale_to_set
+                    self._check_child_encoding_details(var_dict,
+                                                       expected_fsencoding,
+                                                       expected_warning)
+
+
+
+@test.support.cpython_only
+@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
+                     "C locale coercion disabled at build time")
+class LocaleCoercionTests(_LocaleCoercionTargetsTestCase):
+    # Test implicit reconfiguration of the environment during CLI startup
+
+    def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale):
+        """Check the C locale handling for various configurations
+
+        Parameters:
+            expected_fsencoding: the encoding the child is expected to report
+            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
+              None: don't set the variable at all
+              str: the value set in the child's environment
+        """
+
+        # Check for expected warning on stderr if C locale is coerced
+        self.maxDiff = None
+
+        expected_warning = []
+        if coerce_c_locale != "0":
+            expected_warning.append(self.EXPECTED_COERCION_WARNING)
+
+        base_var_dict = {
+            "LANG": "",
+            "LC_CTYPE": "",
+            "LC_ALL": "",
+        }
+        for env_var in ("LANG", "LC_CTYPE"):
+            for locale_to_set in ("", "C", "POSIX", "invalid.ascii"):
+                with self.subTest(env_var=env_var,
+                                  nominal_locale=locale_to_set,
+                                  PYTHONCOERCECLOCALE=coerce_c_locale):
+                    var_dict = base_var_dict.copy()
+                    var_dict[env_var] = locale_to_set
+                    if coerce_c_locale is not None:
+                        var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale
+                    self._check_child_encoding_details(var_dict,
+                                                       expected_fsencoding,
+                                                       expected_warning)
+
+    def test_test_PYTHONCOERCECLOCALE_not_set(self):
+        # This should coerce to the first available target locale by default
+        self._check_c_locale_coercion("utf-8", coerce_c_locale=None)
+
+    def test_PYTHONCOERCECLOCALE_not_zero(self):
+        # *Any* string other than "0" is considered "set" for our purposes
+        # and hence should result in the locale coercion being enabled
+        for setting in ("", "1", "true", "false"):
+            self._check_c_locale_coercion("utf-8", coerce_c_locale=setting)
+
+    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
+        # The setting "0" should result in the locale coercion being disabled
+        self._check_c_locale_coercion("ascii", coerce_c_locale="0")
+
+
+def test_main():
+    test.support.run_unittest(
+        LocaleConfigurationTests,
+        LocaleCoercionTests,
+        LocaleWarningTests
+    )
+    test.support.reap_children()
+
+if __name__ == "__main__":
+    test_main()
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py

index 8ac8af9826569597c638c777d3ba9ee602f54954..c4a976642909497eb90d34bd78d59d85d765b83a 100644 (file)
--- a/Lib/test/test_capi.py
+++ b/Lib/test/test_capi.py
@@ -371,14 +371,21 @@ class EmbeddingTests(unittest.TestCase):
      def tearDown(self):
          os.chdir(self.oldcwd)
  
-    def run_embedded_interpreter(self, *args):
+    def run_embedded_interpreter(self, *args, env=None):
          """Runs a test in the embedded interpreter"""
          cmd = [self.test_exe]
          cmd.extend(args)
+        if env is not None and sys.platform == 'win32':
+            # Windows requires at least the SYSTEMROOT environment variable to
+            # start Python.
+            env = env.copy()
+            env['SYSTEMROOT'] = os.environ['SYSTEMROOT']
+
          p = subprocess.Popen(cmd,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
-                             universal_newlines=True)
+                             universal_newlines=True,
+                             env=env)
          (out, err) = p.communicate()
          self.assertEqual(p.returncode, 0,
                           "bad returncode %d, stderr is %r" %
@@ -471,26 +478,16 @@ class EmbeddingTests(unittest.TestCase):
                  self.assertNotEqual(sub.tstate, main.tstate)
                  self.assertNotEqual(sub.modules, main.modules)
  
-    @staticmethod
-    def _get_default_pipe_encoding():
-        rp, wp = os.pipe()
-        try:
-            with os.fdopen(wp, 'w') as w:
-                default_pipe_encoding = w.encoding
-        finally:
-            os.close(rp)
-        return default_pipe_encoding
-
      def test_forced_io_encoding(self):
          # Checks forced configuration of embedded interpreter IO streams
-        out, err = self.run_embedded_interpreter("forced_io_encoding")
-        if support.verbose:
+        env = {"PYTHONIOENCODING": "utf-8:surrogateescape"}
+        out, err = self.run_embedded_interpreter("forced_io_encoding", env=env)
+        if support.verbose > 1:
              print()
              print(out)
              print(err)
-        expected_errors = sys.__stdout__.errors
-        expected_stdin_encoding = sys.__stdin__.encoding
-        expected_pipe_encoding = self._get_default_pipe_encoding()
+        expected_stream_encoding = "utf-8"
+        expected_errors = "surrogateescape"
          expected_output = '\n'.join([
          "--- Use defaults ---",
          "Expected encoding: default",
@@ -517,8 +514,8 @@ class EmbeddingTests(unittest.TestCase):
          "stdout: latin-1:replace",
          "stderr: latin-1:backslashreplace"])
          expected_output = expected_output.format(
-                                in_encoding=expected_stdin_encoding,
-                                out_encoding=expected_pipe_encoding,
+                                in_encoding=expected_stream_encoding,
+                                out_encoding=expected_stream_encoding,
                                  errors=expected_errors)
          # This is useful if we ever trip over odd platform behaviour
          self.maxDiff = None
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py

index 5f96d61c7d0ab7b89c42648dde967d5c55526c7a..3e20427917345a52d3664c9e4f95829212c2ae69 100644 (file)
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -8,8 +8,9 @@ import sys
  import subprocess
  import tempfile
  from test.support import script_helper, is_android
-from test.support.script_helper import (spawn_python, kill_python, assert_python_ok,
-    assert_python_failure)
+from test.support.script_helper import (
+    spawn_python, kill_python, assert_python_ok, assert_python_failure
+)
  
  
  # XXX (ncoghlan): Move to script_helper and make consistent with run_python
@@ -150,6 +151,7 @@ class CmdLineTest(unittest.TestCase):
          env = os.environ.copy()
          # Use C locale to get ascii for the locale encoding
          env['LC_ALL'] = 'C'
+        env['PYTHONCOERCECLOCALE'] = '0'
          code = (
              b'import locale; '
              b'print(ascii("' + undecodable + b'"), '
diff --git a/Lib/test/test_subprocess.py b/Lib/test/test_subprocess.py

index 7fabe6ad7653326ca19e081a3dc580f658f3d85c..52b05c12b641da4ff884758a46f83cc9f65e4c23 100644 (file)
--- a/Lib/test/test_subprocess.py
+++ b/Lib/test/test_subprocess.py
@@ -642,7 +642,8 @@ class ProcessTestCase(BaseTestCase):
              # on adding even when the environment in exec is empty.
              # Gentoo sandboxes also force LD_PRELOAD and SANDBOX_* to exist.
              return ('VERSIONER' in n or '__CF' in n or  # MacOS
-                    n == 'LD_PRELOAD' or n.startswith('SANDBOX'))  # Gentoo
+                    n == 'LD_PRELOAD' or n.startswith('SANDBOX') or # Gentoo
+                    n == 'LC_CTYPE') # Locale coercion triggered
  
          with subprocess.Popen([sys.executable, "-c",
                                 'import os; print(list(os.environ.keys()))'],
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py

index ed78e2a0f9aea3fd82abee8646594fd644b5df9f..3844812ba288733ac35b8462debe2eb82df632d6 100644 (file)
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -682,6 +682,7 @@ class SysModuleTest(unittest.TestCase):
          # Force the POSIX locale
          env = os.environ.copy()
          env["LC_ALL"] = "C"
+        env["PYTHONCOERCECLOCALE"] = "0"
          code = '\n'.join((
              'import sys',
              'def dump(name):',
diff --git a/Misc/NEWS b/Misc/NEWS

index e58de824566254c456b5fc3914468ace569bf06b..8cbd4632889f7f211a3013194dae3f52aa227957 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,11 @@ What's New in Python 3.7.0 alpha 1?
  Core and Builtins
  -----------------
  
+- bpo-28180: Implement PEP 538 (legacy C locale coercion). This means that when
+  a suitable coercion target locale is available, both the core interpreter and
+  locale-aware C extensions will assume the use of UTF-8 as the default text
+  encoding, rather than ASCII.
+
  - bpo-30486: Allows setting cell values for __closure__. Patch by Lisa Roach.
  
  - bpo-30537: itertools.islice now accepts integer-like objects (having
diff --git a/Programs/python.c b/Programs/python.c

index a7afbc774b3a55b01bec9ed41eed1a376a2c4232..03f8295045cfc65609ea742d5f8dcc3cefa1e44c 100644 (file)
--- a/Programs/python.c
+++ b/Programs/python.c
@@ -15,6 +15,21 @@ wmain(int argc, wchar_t **argv)
  }
  #else
  
+/* Access private pylifecycle helper API to better handle the legacy C locale
+ *
+ * The legacy C locale assumes ASCII as the default text encoding, which
+ * causes problems not only for the CPython runtime, but also other
+ * components like GNU readline.
+ *
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
+ * more capable UTF-8 based alternative.
+ *
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
+ *
+ */
+extern int _Py_LegacyLocaleDetected(void);
+extern void _Py_CoerceLegacyLocale(void);
+
  int
  main(int argc, char **argv)
  {
@@ -25,7 +40,11 @@ main(int argc, char **argv)
      char *oldloc;
  
      /* Force malloc() allocator to bootstrap Python */
+#ifdef Py_DEBUG
+    (void)_PyMem_SetupAllocators("malloc_debug");
+#  else
      (void)_PyMem_SetupAllocators("malloc");
+#  endif
  
      argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
      argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
@@ -49,7 +68,21 @@ main(int argc, char **argv)
          return 1;
      }
  
+#ifdef __ANDROID__
+    /* Passing "" to setlocale() on Android requests the C locale rather
+     * than checking environment variables, so request C.UTF-8 explicitly
+     */
+    setlocale(LC_ALL, "C.UTF-8");
+#else
+    /* Reconfigure the locale to the default for this process */
      setlocale(LC_ALL, "");
+#endif
+
+    if (_Py_LegacyLocaleDetected()) {
+        _Py_CoerceLegacyLocale();
+    }
+
+    /* Convert from char to wchar_t based on the locale settings */
      for (i = 0; i < argc; i++) {
          argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
          if (!argv_copy[i]) {
@@ -70,7 +103,11 @@ main(int argc, char **argv)
  
      /* Force again malloc() allocator to release memory blocks allocated
         before Py_Main() */
+#ifdef Py_DEBUG
+    (void)_PyMem_SetupAllocators("malloc_debug");
+#  else
      (void)_PyMem_SetupAllocators("malloc");
+#  endif
  
      for (i = 0; i < argc; i++) {
          PyMem_RawFree(argv_copy2[i]);
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c

index ec26824f839eeed5509a6a73445de55f9283f8a0..b7c98225641176fba43d125bcb2ea5de384faaee 100644 (file)
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -178,6 +178,7 @@ Py_SetStandardStreamEncoding(const char *encoding, const char *errors)
      return 0;
  }
  
+
  /* Global initializations.  Can be undone by Py_FinalizeEx().  Don't
     call this twice without an intervening Py_FinalizeEx() call.  When
     initializations fail, a fatal error is issued and the function does
@@ -330,6 +331,159 @@ initexternalimport(PyInterpreterState *interp)
      Py_DECREF(value);
  }
  
+/* Helper functions to better handle the legacy C locale
+ *
+ * The legacy C locale assumes ASCII as the default text encoding, which
+ * causes problems not only for the CPython runtime, but also other
+ * components like GNU readline.
+ *
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
+ * more capable UTF-8 based alternative as follows:
+ *
+ *     if (_Py_LegacyLocaleDetected()) {
+ *         _Py_CoerceLegacyLocale();
+ *     }
+ *
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
+ *
+ * Locale coercion also impacts the default error handler for the standard
+ * streams: while the usual default is "strict", the default for the legacy
+ * C locale and for any of the coercion target locales is "surrogateescape".
+ */
+
+int
+_Py_LegacyLocaleDetected(void)
+{
+#ifndef MS_WINDOWS
+    /* On non-Windows systems, the C locale is considered a legacy locale */
+    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+    return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0;
+#else
+    /* Windows uses code pages instead of locales, so no locale is legacy */
+    return 0;
+#endif
+}
+
+typedef struct _CandidateLocale {
+    const char *locale_name; /* The locale to try as a coercion target */
+} _LocaleCoercionTarget;
+
+static _LocaleCoercionTarget _TARGET_LOCALES[] = {
+    {"C.UTF-8"},
+    {"C.utf8"},
+    {"UTF-8"},
+    {NULL}
+};
+
+static char *
+get_default_standard_stream_error_handler(void)
+{
+    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+    if (ctype_loc != NULL) {
+        /* "surrogateescape" is the default in the legacy C locale */
+        if (strcmp(ctype_loc, "C") == 0) {
+            return "surrogateescape";
+        }
+
+#ifdef PY_COERCE_C_LOCALE
+        /* "surrogateescape" is the default in locale coercion target locales */
+        const _LocaleCoercionTarget *target = NULL;
+        for (target = _TARGET_LOCALES; target->locale_name; target++) {
+            if (strcmp(ctype_loc, target->locale_name) == 0) {
+                return "surrogateescape";
+            }
+        }
+#endif
+   }
+
+   /* Otherwise return NULL to request the typical default error handler */
+   return NULL;
+}
+
+#ifdef PY_COERCE_C_LOCALE
+static const char *_C_LOCALE_COERCION_WARNING =
+    "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale "
+    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n";
+
+static void
+_coerce_default_locale_settings(const _LocaleCoercionTarget *target)
+{
+    const char *newloc = target->locale_name;
+
+    /* Reset locale back to currently configured defaults */
+    setlocale(LC_ALL, "");
+
+    /* Set the relevant locale environment variable */
+    if (setenv("LC_CTYPE", newloc, 1)) {
+        fprintf(stderr,
+                "Error setting LC_CTYPE, skipping C locale coercion\n");
+        return;
+    }
+    fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
+
+    /* Reconfigure with the overridden environment variables */
+    setlocale(LC_ALL, "");
+}
+#endif
+
+void
+_Py_CoerceLegacyLocale(void)
+{
+#ifdef PY_COERCE_C_LOCALE
+    /* We ignore the Python -E and -I flags here, as the CLI needs to sort out
+     * the locale settings *before* we try to do anything with the command
+     * line arguments. For cross-platform debugging purposes, we also need
+     * to give end users a way to force even scripts that are otherwise
+     * isolated from their environment to use the legacy ASCII-centric C
+     * locale.
+     *
+     * Ignoring -E and -I is safe from a security perspective, as we only use
+     * the setting to turn *off* the implicit locale coercion, and anyone with
+     * access to the process environment already has the ability to set
+     * `LC_ALL=C` to override the C level locale settings anyway.
+     */
+    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+    if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
+        /* PYTHONCOERCECLOCALE is not set, or is set to something other than "0" */
+        const char *locale_override = getenv("LC_ALL");
+        if (locale_override == NULL || *locale_override == '\0') {
+            /* LC_ALL is also not set (or is set to an empty string) */
+            const _LocaleCoercionTarget *target = NULL;
+            for (target = _TARGET_LOCALES; target->locale_name; target++) {
+                const char *new_locale = setlocale(LC_CTYPE,
+                                                   target->locale_name);
+                if (new_locale != NULL) {
+                    /* Successfully configured locale, so make it the default */
+                    _coerce_default_locale_settings(target);
+                    return;
+                }
+            }
+        }
+    }
+    /* No C locale warning here, as Py_Initialize will emit one later */
+#endif
+}
+
+
+#ifdef PY_WARN_ON_C_LOCALE
+static const char *_C_LOCALE_WARNING =
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
+    "locales is recommended.\n";
+
+static void
+_emit_stderr_warning_for_c_locale(void)
+{
+    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+    if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
+        if (_Py_LegacyLocaleDetected()) {
+            fprintf(stderr, "%s", _C_LOCALE_WARNING);
+        }
+    }
+}
+#endif
+
  
  /* Global initializations.  Can be undone by Py_Finalize().  Don't
     call this twice without an intervening Py_Finalize() call.
@@ -396,11 +550,21 @@ void _Py_InitializeCore(const _PyCoreConfig *config)
       */
      _Py_Finalizing = NULL;
  
-#ifdef HAVE_SETLOCALE
+#ifdef __ANDROID__
+    /* Passing "" to setlocale() on Android requests the C locale rather
+     * than checking environment variables, so request C.UTF-8 explicitly
+     */
+    setlocale(LC_CTYPE, "C.UTF-8");
+#else
+#ifndef MS_WINDOWS
      /* Set up the LC_CTYPE locale, so we can obtain
         the locale's charset without having to switch
         locales. */
      setlocale(LC_CTYPE, "");
+#ifdef PY_WARN_ON_C_LOCALE
+    _emit_stderr_warning_for_c_locale();
+#endif
+#endif
  #endif
  
      if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0')
@@ -1457,12 +1621,8 @@ initstdio(void)
              }
          }
          if (!errors && !(pythonioencoding && *pythonioencoding)) {
-            /* When the LC_CTYPE locale is the POSIX locale ("C locale"),
-               stdin and stdout use the surrogateescape error handler by
-               default, instead of the strict error handler. */
-            char *loc = setlocale(LC_CTYPE, NULL);
-            if (loc != NULL && strcmp(loc, "C") == 0)
-                errors = "surrogateescape";
+            /* Choose the default error handler based on the current locale */
+            errors = get_default_standard_stream_error_handler();
          }
      }
  
diff --git a/configure b/configure

index 8b824b24365113bc3d306c8696c65a39c2143f61..ec42e08f8961c161782c83674973e470d2b383b7 100755 (executable)
--- a/configure
+++ b/configure
@@ -834,6 +834,8 @@ with_thread
  enable_ipv6
  with_doc_strings
  with_pymalloc
+with_c_locale_coercion
+with_c_locale_warning
  with_valgrind
  with_dtrace
  with_fpectl
@@ -1528,6 +1530,12 @@ Optional Packages:
                            deprecated; use --with(out)-threads
    --with(out)-doc-strings disable/enable documentation strings
    --with(out)-pymalloc    disable/enable specialized mallocs
+  --with(out)-c-locale-coercion
+                          disable/enable C locale coercion to a UTF-8 based
+                          locale
+  --with(out)-c-locale-warning
+                          disable/enable locale compatibility warning in the C
+                          locale
    --with-valgrind         Enable Valgrind support
    --with(out)-dtrace      disable/enable DTrace support
    --with-fpectl           enable SIGFPE catching
@@ -11047,6 +11055,52 @@ fi
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_pymalloc" >&5
  $as_echo "$with_pymalloc" >&6; }
  
+# Check for --with-c-locale-coercion
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-coercion" >&5
+$as_echo_n "checking for --with-c-locale-coercion... " >&6; }
+
+# Check whether --with-c-locale-coercion was given.
+if test "${with_c_locale_coercion+set}" = set; then :
+  withval=$with_c_locale_coercion;
+fi
+
+
+if test -z "$with_c_locale_coercion"
+then
+    with_c_locale_coercion="yes"
+fi
+if test "$with_c_locale_coercion" != "no"
+then
+
+$as_echo "#define PY_COERCE_C_LOCALE 1" >>confdefs.h
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_coercion" >&5
+$as_echo "$with_c_locale_coercion" >&6; }
+
+# Check for --with-c-locale-warning
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-warning" >&5
+$as_echo_n "checking for --with-c-locale-warning... " >&6; }
+
+# Check whether --with-c-locale-warning was given.
+if test "${with_c_locale_warning+set}" = set; then :
+  withval=$with_c_locale_warning;
+fi
+
+
+if test -z "$with_c_locale_warning"
+then
+    with_c_locale_warning="yes"
+fi
+if test "$with_c_locale_warning" != "no"
+then
+
+$as_echo "#define PY_WARN_ON_C_LOCALE 1" >>confdefs.h
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_warning" >&5
+$as_echo "$with_c_locale_warning" >&6; }
+
  # Check for Valgrind support
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-valgrind" >&5
  $as_echo_n "checking for --with-valgrind... " >&6; }
diff --git a/configure.ac b/configure.ac

index d5b6399cc04014a69e354626185e73c8dce6a36e..18b940ab3291726943088ac3e02b5b4e334f6e1b 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -3325,6 +3325,40 @@ then
  fi
  AC_MSG_RESULT($with_pymalloc)
  
+# Check for --with-c-locale-coercion
+AC_MSG_CHECKING([for --with-c-locale-coercion])
+AC_ARG_WITH([c-locale-coercion],
+            [AS_HELP_STRING([--with(out)-c-locale-coercion],
+              [disable/enable C locale coercion to a UTF-8 based locale])])
+
+if test -z "$with_c_locale_coercion"
+then
+    with_c_locale_coercion="yes"
+fi
+if test "$with_c_locale_coercion" != "no"
+then
+    AC_DEFINE([PY_COERCE_C_LOCALE], [1],
+      [Define if you want to coerce the C locale to a UTF-8 based locale])
+fi
+AC_MSG_RESULT([$with_c_locale_coercion])
+
+# Check for --with-c-locale-warning
+AC_MSG_CHECKING([for --with-c-locale-warning])
+AC_ARG_WITH([c-locale-warning],
+            [AS_HELP_STRING([--with(out)-c-locale-warning],
+              [disable/enable locale compatibility warning in the C locale])])
+
+if test -z "$with_c_locale_warning"
+then
+    with_c_locale_warning="yes"
+fi
+if test "$with_c_locale_warning" != "no"
+then
+    AC_DEFINE([PY_WARN_ON_C_LOCALE], [1],
+      [Define to emit a locale compatibility warning in the C locale])
+fi
+AC_MSG_RESULT([$with_c_locale_warning])
+
  # Check for Valgrind support
  AC_MSG_CHECKING([for --with-valgrind])
  AC_ARG_WITH([valgrind],
diff --git a/pyconfig.h.in b/pyconfig.h.in

index 0a3d59ef9ae602fa02405c7b7068efaf7b55b21b..fa2792b18ad419ec6adda99056cf2e89fec392fb 100644 (file)
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -1247,9 +1247,15 @@
  /* Define as the preferred size in bits of long digits */
  #undef PYLONG_BITS_IN_DIGIT
  
+/* Define if you want to coerce the C locale to a UTF-8 based locale */
+#undef PY_COERCE_C_LOCALE
+
  /* Define to printf format modifier for Py_ssize_t */
  #undef PY_FORMAT_SIZE_T
  
+/* Define to emit a locale compatibility warning in the C locale */
+#undef PY_WARN_ON_C_LOCALE
+
  /* Define if you want to build an interpreter with many run-time checks. */
  #undef Py_DEBUG
author	Nick Coghlan <ncoghlan@gmail.com>
	Sun, 11 Jun 2017 03:16:15 +0000 (13:16 +1000)
committer	GitHub <noreply@github.com>
	Sun, 11 Jun 2017 03:16:15 +0000 (13:16 +1000)
Doc/using/cmdline.rst		patch \| blob \| history
Doc/whatsnew/3.7.rst		patch \| blob \| history
Lib/test/support/script_helper.py		patch \| blob \| history
Lib/test/test_c_locale_coercion.py	[new file with mode: 0644]	patch \| blob
Lib/test/test_capi.py		patch \| blob \| history
Lib/test/test_cmd_line.py		patch \| blob \| history
Lib/test/test_subprocess.py		patch \| blob \| history
Lib/test/test_sys.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Programs/python.c		patch \| blob \| history
Python/pylifecycle.c		patch \| blob \| history
configure		patch \| blob \| history
configure.ac		patch \| blob \| history
pyconfig.h.in		patch \| blob \| history