Remove usage of re module from encodings package search function.

author Marc-André Lemburg <mal@egenix.com>

Fri, 16 May 2003 17:07:51 +0000 (17:07 +0000)

committer Marc-André Lemburg <mal@egenix.com>

Fri, 16 May 2003 17:07:51 +0000 (17:07 +0000)
author Marc-André Lemburg <mal@egenix.com>
Fri, 16 May 2003 17:07:51 +0000 (17:07 +0000)
committer Marc-André Lemburg <mal@egenix.com>
Fri, 16 May 2003 17:07:51 +0000 (17:07 +0000)
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py

index 66bea5c06846d2586bc709ba9a5aa9580d61669b..666afad6b6ec7a5a0f546ea5157e762eb26bda18 100644 (file)
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -27,12 +27,17 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
  
  """#"
  
-import codecs, exceptions, re
+import codecs, exceptions, types
  
  _cache = {}
  _unknown = '--unknown--'
  _import_tail = ['*']
-_norm_encoding_RE = re.compile('[^a-zA-Z0-9.]')
+_norm_encoding_map = ('                                              . '
+                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
+                      ' abcdefghijklmnopqrstuvwxyz                     '
+                      '                                                '
+                      '                                                '
+                      '                ')
  
  class CodecRegistryError(exceptions.LookupError,
                           exceptions.SystemError):
@@ -45,10 +50,20 @@ def normalize_encoding(encoding):
          Normalization works as follows: all non-alphanumeric
          characters except the dot used for Python package names are
          collapsed and replaced with a single underscore, e.g. '  -;#'
-        becomes '_'.
+        becomes '_'. Leading and trailing underscores are removed.
+
+        Note that encoding names should be ASCII only; if they do use
+        non-ASCII characters, these must be Latin-1 compatible.
  
      """
-    return '_'.join(_norm_encoding_RE.split(encoding))
+    # Make sure we have an 8-bit string, because .translate() works
+    # differently for Unicode strings.
+    if type(encoding) is types.UnicodeType:
+        # Note that .encode('latin-1') does *not* use the codec
+        # registry, so this call doesn't recurse. (See unicodeobject.c
+        # PyUnicode_AsEncodedString() for details)
+        encoding = encoding.encode('latin-1')
+    return '_'.join(encoding.translate(_norm_encoding_map).split())
  
  def search_function(encoding):
author	Marc-André Lemburg <mal@egenix.com>
	Fri, 16 May 2003 17:07:51 +0000 (17:07 +0000)
committer	Marc-André Lemburg <mal@egenix.com>
	Fri, 16 May 2003 17:07:51 +0000 (17:07 +0000)