From: Marc-André Lemburg
Date: Fri, 16 May 2003 17:07:51 +0000 (+0000)
Subject: Remove usage of re module from encodings package search function.
X-Git-Tag: v2.3c1~701
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=282012593510a285fec5b8b5e42b04fef3ffffe0;p=python

Remove usage of re module from encodings package search function.
---

diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 66bea5c068..666afad6b6 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -27,12 +27,17 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 
 """#"
 
-import codecs, exceptions, re
+import codecs, exceptions, types
 
 _cache = {}
 _unknown = '--unknown--'
 _import_tail = ['*']
-_norm_encoding_RE = re.compile('[^a-zA-Z0-9.]')
+_norm_encoding_map = ('                                              . '
+                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
+                      ' abcdefghijklmnopqrstuvwxyz                     '
+                      '                                                '
+                      '                                                '
+                      '                ')
 
 class CodecRegistryError(exceptions.LookupError,
                          exceptions.SystemError):
@@ -45,10 +50,20 @@ def normalize_encoding(encoding):
         Normalization works as follows: all non-alphanumeric
         characters except the dot used for Python package names are
         collapsed and replaced with a single underscore, e.g. ' -;#'
-        becomes '_'.
+        becomes '_'. Leading and trailing underscores are removed.
+
+        Note that encoding names should be ASCII only; if they do use
+        non-ASCII characters, these must be Latin-1 compatible.
 
     """
-    return '_'.join(_norm_encoding_RE.split(encoding))
+    # Make sure we have an 8-bit string, because .translate() works
+    # differently for Unicode strings.
+    if type(encoding) is types.UnicodeType:
+        # Note that .encode('latin-1') does *not* use the codec
+        # registry, so this call doesn't recurse. (See unicodeobject.c
+        # PyUnicode_AsEncodedString() for details)
+        encoding = encoding.encode('latin-1')
+    return '_'.join(encoding.translate(_norm_encoding_map).split())
 
 def search_function(encoding):
 