From: Senthil Kumaran Date: Sun, 18 Jul 2010 02:27:10 +0000 (+0000) Subject: Fixing Issue1712522 - urllib.quote to support Unicode. The default X-Git-Tag: v2.7.1rc1~588 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5dba6dfe6a8224a54c3a4fd5cc2393a734718a77;p=python Fixing Issue1712522 - urllib.quote to support Unicode. The default encoding='utf-8' and errors='strict'. --- diff --git a/Doc/library/urllib.rst b/Doc/library/urllib.rst index 5621f51226..7b4d0bf2bc 100644 --- a/Doc/library/urllib.rst +++ b/Doc/library/urllib.rst @@ -202,24 +202,40 @@ High-level interface Utility functions ----------------- -.. function:: quote(string[, safe]) +.. function:: quote(string[, safe[, encoding[, errors]]]) Replace special characters in *string* using the ``%xx`` escape. Letters, digits, and the characters ``'_.-'`` are never quoted. By default, this - function is intended for quoting the path section of the URL.The optional + function is intended for quoting the path section of the URL. The optional *safe* parameter specifies additional characters that should not be quoted --- its default value is ``'/'``. + *string* may be either a :class:`str` or a :class:`unicode`. + + The optional *encoding* and *errors* parameters specify how to deal with + non-ASCII characters, as accepted by the :meth:`unicode.encode` method. + *encoding* defaults to ``'utf-8'``. + *errors* defaults to ``'strict'``, meaning unsupported characters raise a + :class:`UnicodeEncodeError`. + Non-Unicode strings are not encoded by default, and all bytes are allowed. + Example: ``quote('/~connolly/')`` yields ``'/%7econnolly/'``. + Example: ``quote(u'/El Niño/')`` yields ``'/El%20Ni%C3%B1o/'``. + + .. versionchanged:: 2.7.1 + Added *encoding* and *errors* parameters. -.. function:: quote_plus(string[, safe]) + +.. 
function:: quote_plus(string[, safe[, encoding[, errors]]]) Like :func:`quote`, but also replaces spaces by plus signs, as required for quoting HTML form values when building up a query string to go into a URL. Plus signs in the original string are escaped unless they are included in *safe*. It also does not have *safe* default to ``'/'``. + Example: ``quote_plus(u'/El Niño/')`` yields ``'%2FEl+Ni%C3%B1o%2F'``. + .. function:: unquote(string) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 9bd8857371..77fa8f6e87 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -355,6 +355,38 @@ class QuotingTests(unittest.TestCase): self.assertEqual(quote_by_default, result, "using quote_plus(): %s != %s" % (quote_by_default, result)) + # Safe expressed as unicode rather than str + result = urllib.quote(quote_by_default, safe=u"<>") + self.assertEqual(quote_by_default, result, + "using quote(): %r != %r" % (quote_by_default, result)) + # "Safe" non-ASCII bytes should still work + # (Technically disallowed by the URI standard, but allowed for + # backwards compatibility with previous versions of Python) + result = urllib.quote(b"a\xfcb", safe=b"\xfc") + expect = b"a\xfcb" + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) + # Same as above, but with 'safe' as a unicode rather than str + # "Safe" non-ASCII unicode characters should have no effect + # (Since URIs are not allowed to have non-ASCII characters) + result = urllib.quote(b"a\xfcb", safe=u"\xfc") + expect = urllib.quote(b"a\xfcb", safe="") + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) + # Same as above, but quoting a unicode rather than a str + result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=b"\xfc") + expect = b"a\xfcb" + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) + # Same as above, but with both the quoted value and 'safe' as unicode + result = 
urllib.quote(u"a\xfcb", encoding="latin-1", safe=u"\xfc") + expect = urllib.quote(u"a\xfcb", encoding="latin-1", safe="") + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) def test_default_quoting(self): # Make sure all characters that should be quoted are by default sans @@ -406,6 +438,81 @@ class QuotingTests(unittest.TestCase): 'alpha%2Bbeta+gamma') self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'), 'alpha+beta+gamma') + # Test with unicode + self.assertEqual(urllib.quote_plus(u'alpha+beta gamma'), + 'alpha%2Bbeta+gamma') + # Test with safe unicode + self.assertEqual(urllib.quote_plus('alpha+beta gamma', u'+'), + 'alpha+beta+gamma') + + def test_quote_bytes(self): + # Non-ASCII bytes should quote directly to percent-encoded values + given = b"\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" + result = urllib.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Encoding argument should raise UnicodeDecodeError on bytes input + # with non-ASCII characters (just as with str.encode). 
+ self.assertRaises(UnicodeDecodeError, urllib.quote, given, + encoding="latin-1") + + def test_quote_with_unicode(self): + # Characters in Latin-1 range, encoded by default in UTF-8 + given = u"\xa2\xd8ab\xff" + expect = "%C2%A2%C3%98ab%C3%BF" + result = urllib.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded with None (default) + result = urllib.quote(given, encoding=None, errors=None) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded with Latin-1 + given = u"\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" + result = urllib.quote(given, encoding="latin-1") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, encoded by default in UTF-8 + given = u"\u6f22\u5b57" # "Kanji" + expect = "%E6%BC%A2%E5%AD%97" + result = urllib.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, encoded with Latin-1 + given = u"\u6f22\u5b57" + self.assertRaises(UnicodeEncodeError, urllib.quote, given, + encoding="latin-1") + # Characters in BMP, encoded with Latin-1, with replace error handling + given = u"\u6f22\u5b57" + expect = "%3F%3F" # "??" 
+ result = urllib.quote(given, encoding="latin-1", + errors="replace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, Latin-1, with xmlcharref error handling + given = u"\u6f22\u5b57" + expect = "%26%2328450%3B%26%2323383%3B" # "&#28450;&#23383;" + result = urllib.quote(given, encoding="latin-1", + errors="xmlcharrefreplace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + + def test_quote_plus_with_unicode(self): + # Encoding (latin-1) test for quote_plus + given = u"\xa2\xd8 \xff" + expect = "%A2%D8+%FF" + result = urllib.quote_plus(given, encoding="latin-1") + self.assertEqual(expect, result, + "using quote_plus(): %r != %r" % (expect, result)) + # Errors test for quote_plus + given = u"ab\u6f22\u5b57 cd" + expect = "ab%3F%3F+cd" + result = urllib.quote_plus(given, encoding="latin-1", + errors="replace") + self.assertEqual(expect, result, + "using quote_plus(): %r != %r" % (expect, result)) class UnquotingTests(unittest.TestCase): """Tests for unquote() and unquote_plus() diff --git a/Lib/urllib.py b/Lib/urllib.py index fb984216d7..e32a771933 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -1193,7 +1193,7 @@ for i, c in zip(xrange(256), str(bytearray(xrange(256)))): _safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i) _safe_quoters = {} -def quote(s, safe='/'): +def quote(s, safe='/', encoding=None, errors=None): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a @@ -1213,10 +1213,28 @@ def quote(s, safe='/'): is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. + + string and safe may be either str or unicode objects. + + The optional encoding and errors parameters specify how to deal with the + non-ASCII characters, as accepted by the unicode.encode method. 
+ By default, encoding='utf-8' (characters are encoded with UTF-8), and + errors='strict' (unsupported characters raise a UnicodeEncodeError). """ # fastpath if not s: return s + + if encoding is not None or isinstance(s, unicode): + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'strict' + s = s.encode(encoding, errors) + if isinstance(safe, unicode): + # Normalize 'safe' by converting to str and removing non-ASCII chars + safe = safe.encode('ascii', 'ignore') + cachekey = (safe, always_safe) try: (quoter, safe) = _safe_quoters[cachekey] @@ -1230,12 +1248,12 @@ def quote(s, safe='/'): return s return ''.join(map(quoter, s)) -def quote_plus(s, safe=''): +def quote_plus(s, safe='', encoding=None, errors=None): """Quote the query fragment of a URL; replacing ' ' with '+'""" if ' ' in s: - s = quote(s, safe + ' ') + s = quote(s, safe + ' ', encoding, errors) return s.replace(' ', '+') - return quote(s, safe) + return quote(s, safe, encoding, errors) def urlencode(query, doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. diff --git a/Misc/NEWS b/Misc/NEWS index d86566e09f..819a401172 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -18,6 +18,10 @@ Core and Builtins Library ------- +- Issue #1712522: urllib.quote supports Unicode strings with encoding and errors + parameters. The encoding parameter defaults to utf-8 and errors to strict. + Patch by Matt Giuca. + - Issue #7646: The fnmatch pattern cache no longer grows without bound. - Issue #9136: Fix 'dictionary changed size during iteration'