Additional test and documentation for the unicode() changes.

author Marc-André Lemburg <mal@egenix.com>

Fri, 19 Oct 2001 12:02:29 +0000 (12:02 +0000)

committer Marc-André Lemburg <mal@egenix.com>

Fri, 19 Oct 2001 12:02:29 +0000 (12:02 +0000)
author Marc-André Lemburg <mal@egenix.com>
Fri, 19 Oct 2001 12:02:29 +0000 (12:02 +0000)
committer Marc-André Lemburg <mal@egenix.com>
Fri, 19 Oct 2001 12:02:29 +0000 (12:02 +0000)
diff --git a/Doc/lib/libfuncs.tex b/Doc/lib/libfuncs.tex

index 080876105b7bcb5d422dfc4c71e55051f3f43456..b19d4a643a2adc6e4f08f1c08ec2fb8ee376b8c9 100644 (file)
--- a/Doc/lib/libfuncs.tex
+++ b/Doc/lib/libfuncs.tex
@@ -758,19 +758,33 @@ def my_import(name):
    \versionadded{2.0}
  \end{funcdesc}
  
-\begin{funcdesc}{unicode}{string\optional{, encoding\optional{, errors}}}
-  Create a Unicode string from an 8-bit string \var{string} using the
-  codec for \var{encoding}.  The \var{encoding} parameter is a string
-  giving the name of an encoding.  Error handling is done according to
-  \var{errors}; this specifies the treatment of characters which are
-  invalid in the input encoding.  If \var{errors} is \code{'strict'}
-  (the default), a \exception{ValueError} is raised on errors, while a
-  value of \code{'ignore'} causes errors to be silently ignored, and a
-  value of \code{'replace'} causes the official Unicode replacement
-  character, \code{U+FFFD}, to be used to replace input characters
-  which cannot be decoded.  The default behavior is to decode UTF-8 in
-  strict mode, meaning that encoding errors raise
-  \exception{ValueError}.  See also the \refmodule{codecs} module.
+\begin{funcdesc}{unicode}{object\optional{, encoding\optional{, errors}}}
+  Return the Unicode string version of \var{object} using one of the
+  following modes:
+
+  If \var{encoding} and/or \var{errors} are given, \code{unicode()}
+  will decode the object, which can be either an 8-bit string or a
+  character buffer, using the codec for \var{encoding}. The
+  \var{encoding} parameter is a string giving the name of an encoding.
+  Error handling is done according to \var{errors}; this specifies the
+  treatment of characters which are invalid in the input encoding.  If
+  \var{errors} is \code{'strict'} (the default), a
+  \exception{ValueError} is raised on errors, while a value of
+  \code{'ignore'} causes errors to be silently ignored, and a value of
+  \code{'replace'} causes the official Unicode replacement character,
+  \code{U+FFFD}, to be used to replace input characters which cannot
+  be decoded.  See also the \refmodule{codecs} module.
+
+  If no optional parameters are given, \code{unicode()} will mimic the
+  behaviour of \code{str()} except that it returns Unicode strings
+  instead of 8-bit strings. More precisely, if \var{object} is a
+  Unicode string or subclass it will return a Unicode string without
+  any additional decoding applied. For objects which provide a
+  \code{__unicode__} method, it will call this method without
+  arguments to create a Unicode string. For all other objects, the
+  8-bit string version or representation is requested and then
+  converted to a Unicode string using the codec for the default
+  encoding in \code{'strict'} mode.
    \versionadded{2.0}
  \end{funcdesc}
  
diff --git a/Lib/test/output/test_unicode b/Lib/test/output/test_unicode

index 783a4860ab4353a4d5e6b0ca7cd62c2f76d0280a..82ed240075159cdd08094883f050b3b0545741d5 100644 (file)
--- a/Lib/test/output/test_unicode
+++ b/Lib/test/output/test_unicode
@@ -2,6 +2,7 @@ test_unicode
  Testing Unicode comparisons... done.
  Testing Unicode contains method... done.
  Testing Unicode formatting strings... done.
+Testing builtin unicode()... done.
  Testing builtin codecs... done.
  Testing standard mapping codecs... 0-127... 128-255... done.
  Testing Unicode string concatenation... done.
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 68eae13115d2a3a6072c48183f61490fe35d873a..eff11cfa6a4e218252f48e8126605514aca36982 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -389,6 +389,67 @@ verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
  verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103   abc')
  print 'done.'
  
+print 'Testing builtin unicode()...',
+
+# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
+
+verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
+
+class UnicodeSubclass(unicode):
+    pass
+
+verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
+       == u'unicode subclass becomes unicode')
+
+verify(unicode('strings are converted to unicode')
+       == u'strings are converted to unicode')
+
+class UnicodeCompat:
+    def __init__(self, x):
+        self.x = x
+    def __unicode__(self):
+        return self.x
+
+verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
+       == u'__unicode__ compatible objects are recognized')
+
+class StringCompat:
+    def __init__(self, x):
+        self.x = x
+    def __str__(self):
+        return self.x
+
+verify(unicode(StringCompat('__str__ compatible objects are recognized'))
+       == u'__str__ compatible objects are recognized')
+
+# unicode(obj) is compatible with str():
+
+o = StringCompat('unicode(obj) is compatible to str()')
+verify(unicode(o) == u'unicode(obj) is compatible to str()')
+verify(str(o) == 'unicode(obj) is compatible to str()')
+
+for obj in (123, 123.45, 123L):
+    verify(unicode(obj) == unicode(str(obj)))
+
+# unicode(obj, encoding, errors) tests (this maps to
+# PyUnicode_FromEncodedObject() at C level)
+
+try:
+    unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
+except TypeError:
+    pass
+else:
+    raise TestFailed, "decoding unicode should NOT be supported"
+
+verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
+       == u'strings are decoded to unicode')
+
+verify(unicode(buffer('character buffers are decoded to unicode'),
+               'utf-8', 'strict')
+       == u'character buffers are decoded to unicode')
+
+print 'done.'
+
  # Test builtin codecs
  print 'Testing builtin codecs...',
  
@@ -437,32 +498,11 @@ verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
  # * strict decoding testing for all of the
  #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
  
-
-
  verify(unicode('hello','ascii') == u'hello')
  verify(unicode('hello','utf-8') == u'hello')
  verify(unicode('hello','utf8') == u'hello')
  verify(unicode('hello','latin-1') == u'hello')
  
-# Compatibility to str():
-class String:
-    x = ''
-    def __str__(self):
-        return self.x
-
-o = String()
-
-o.x = 'abc'
-verify(unicode(o) == u'abc')
-verify(str(o) == 'abc')
-
-o.x = u'abc'
-verify(unicode(o) == u'abc')
-verify(str(o) == 'abc')
-
-for obj in (123, 123.45, 123L):
-    verify(unicode(obj) == unicode(str(obj)))
-
  # Error handling
  try:
      u'Andr\202 x'.encode('ascii')
diff --git a/Misc/NEWS b/Misc/NEWS

index b7ea28ce06717074260d97c9828b33331a7564c4..522ba40778fc962f970551570cb61ec9752bfffc 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -44,7 +44,7 @@ Core and builtins
  - unicode(obj) now behaves more like str(obj), accepting arbitrary
    objects, and calling a __unicode__ method if it exists.
    unicode(obj, encoding) and unicode(obj, encoding, errors) still
-  require an 8-bit string argument.
+  require an 8-bit string or character buffer argument.
  
  - isinstance() now allows any object as the first argument and a
    class, a type or something with a __bases__ tuple attribute for the
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index a29c75b5a34bc900b6161152cc5379137c7172b8..57ef62a7138fe9496022f62471f66a28836f9c99 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -426,8 +426,9 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
  
  #if 0
      /* For b/w compatibility we also accept Unicode objects provided
-       that no encodings is given and then redirect to PyObject_Unicode() 
-       which then applies the additional logic for Unicode subclasses.
+       that no encoding is given and then redirect to
+       PyObject_Unicode() which then applies the additional logic for
+       Unicode subclasses.
  
         NOTE: This API should really only be used for object which
               represent *encoded* Unicode !
author	Marc-André Lemburg <mal@egenix.com>
	Fri, 19 Oct 2001 12:02:29 +0000 (12:02 +0000)
committer	Marc-André Lemburg <mal@egenix.com>
	Fri, 19 Oct 2001 12:02:29 +0000 (12:02 +0000)
Doc/lib/libfuncs.tex		patch \| blob \| history
Lib/test/output/test_unicode		patch \| blob \| history
Lib/test/test_unicode.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history