From: Martin v. Löwis Date: Fri, 29 May 2009 16:22:26 +0000 (+0000) Subject: Issue #6097: Escape UTF-8 surrogates resulting from mbstowcs conversion X-Git-Tag: v3.1rc1~9 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8ed91b2768137b35dd5cd45f25fa96ad53ba1066;p=python Issue #6097: Escape UTF-8 surrogates resulting from mbstowcs conversion of the command line. --- diff --git a/Misc/NEWS b/Misc/NEWS index b98a368848..1e1277327a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,9 @@ What's New in Python 3.1 release candidate 1? Core and Builtins ----------------- +- Issue #6097: Escape UTF-8 surrogates resulting from mbstowcs conversion + of the command line. + - Issue #6012: Add cleanup support to O& argument parsing. - Issue #6089: Fixed str.format with certain invalid field specifiers diff --git a/Modules/python.c b/Modules/python.c index 13c6d5b82a..edd33f433a 100644 --- a/Modules/python.c +++ b/Modules/python.c @@ -38,8 +38,16 @@ char2wchar(char* arg) if (!res) goto oom; count = mbstowcs(res, arg, argsize+1); - if (count != (size_t)-1) - return res; + if (count != (size_t)-1) { + wchar_t *tmp; + /* Only use the result if it contains no + surrogate characters. */ + for (tmp = res; *tmp != 0 && + (*tmp < 0xd800 || *tmp > 0xdfff); tmp++) + ; + if (*tmp == 0) + return res; + } PyMem_Free(res); } /* Conversion failed. Fall back to escaping with surrogateescape. */ @@ -75,6 +83,14 @@ char2wchar(char* arg) memset(&mbs, 0, sizeof mbs); continue; } + if (*out >= 0xd800 && *out <= 0xdfff) { + /* Surrogate character. Escape the original + byte sequence with surrogateescape. */ + argsize -= converted; + while (converted--) + *out++ = 0xdc00 + *in++; + continue; + } /* successfully converted some bytes */ in += converted; argsize -= converted;