]> granicus.if.org Git - python/commitdiff
cPickle.c: Full support for the new LONG1 and LONG4. Added comments.
authorTim Peters <tim.peters@gmail.com>
Sun, 2 Feb 2003 02:57:53 +0000 (02:57 +0000)
committerTim Peters <tim.peters@gmail.com>
Sun, 2 Feb 2003 02:57:53 +0000 (02:57 +0000)
Assorted code cleanups; e.g., sizeof(char) is 1 by definition, so there's
no need to do things like multiply by sizeof(char) in hairy malloc
arguments.  Fixed an undetected-overflow bug in readline_file().

longobject.c:  Fixed a really stupid bug in the new _PyLong_NumBits.

pickle.py:  Fixed stupid bug in save_long():  When proto is 2, it
wrote LONG1 or LONG4, but forgot to return then -- it went on to
append the proto 1 LONG opcode too.
Fixed equally stupid cancelling bugs in load_long1() and
load_long4():  they *returned* the unpickled long instead of pushing
it on the stack.  The return values were ignored.  Tests passed
before only because save_long() pickled the long twice.

Fixed bugs in encode_long().

Noted that decode_long() is quadratic-time despite our hopes,
because long(string, 16) is still quadratic-time in len(string).
It's hex() that's linear-time.  I don't know a way to make decode_long()
linear-time in Python, short of maybe transforming the 256's-complement
bytes into marshal's funky internal format, and letting marshal decode
that.  It would be more valuable to make long(string, 16) linear time.

pickletester.py:  Added a global "protocols" vector so tests can try
all the protocols in a sane way.  Changed test_ints() and test_unicode()
to do so.  Added a new test_long(), but the tail end of it is disabled
because it "takes forever" under pickle.py (but runs very quickly under
cPickle:  cPickle proto 2 for longs is linear-time).

Lib/pickle.py
Lib/test/pickletester.py
Modules/cPickle.c
Objects/longobject.c

index 92b4802e7e3736d1ec3f4f55acd46c01864ffd36..ba0e38b08e6dca35bdcee96c68aec71958a7dd6a 100644 (file)
@@ -553,6 +553,7 @@ class Pickler:
                 self.write(LONG1 + chr(n) + bytes)
             else:
                 self.write(LONG4 + pack("<i", n) + bytes)
+            return
         self.write(LONG + `obj` + '\n')
     dispatch[LongType] = save_long
 
@@ -1042,13 +1043,13 @@ class Unpickler:
     def load_long1(self):
         n = ord(self.read(1))
         bytes = self.read(n)
-        return decode_long(bytes)
+        self.append(decode_long(bytes))
     dispatch[LONG1] = load_long1
 
     def load_long4(self):
         n = mloads('i' + self.read(4))
         bytes = self.read(n)
-        return decode_long(bytes)
+        self.append(decode_long(bytes))
     dispatch[LONG4] = load_long4
 
     def load_float(self):
@@ -1404,24 +1405,31 @@ def encode_long(x):
         njunkchars = 2 + ashex.endswith('L')
         nibbles = len(ashex) - njunkchars
         if nibbles & 1:
-            # need an even # of nibbles for unhexlify
+            # Extend to a full byte.
             nibbles += 1
         nbits = nibbles * 4
         x += 1L << nbits
         assert x > 0
         ashex = hex(x)
-        if x >> (nbits - 1) == 0:
+        njunkchars = 2 + ashex.endswith('L')
+        newnibbles = len(ashex) - njunkchars
+        if newnibbles < nibbles:
+            ashex = "0x" + "0" * (nibbles - newnibbles) + ashex[2:]
+        if int(ashex[2], 16) < 8:
             # "looks positive", so need a byte of sign bits
-            ashex = "0xff" + x[2:]
+            ashex = "0xff" + ashex[2:]
 
     if ashex.endswith('L'):
         ashex = ashex[2:-1]
     else:
         ashex = ashex[2:]
-    assert len(ashex) & 1 == 0
+    assert len(ashex) & 1 == 0, (x, ashex)
     binary = _binascii.unhexlify(ashex)
     return binary[::-1]
 
+# XXX OOPS!  This is still quadratic-time.  While hex(n) is linear-time
+# XXX in the # of digits in n, long(s, 16) is still quadratic-time
+# XXX in len(s).
 def decode_long(data):
     r"""Decode a long from a two's complement little-endian binary string.
 
@@ -1445,7 +1453,7 @@ def decode_long(data):
     if nbytes == 0:
         return 0L
     ashex = _binascii.hexlify(data[::-1])
-    n = long(ashex, 16)
+    n = long(ashex, 16) # quadratic time
     if data[-1] >= '\x80':
         n -= 1L << (nbytes * 8)
     return n
index 8211dcf58cf3ee395d0822d5205618d6fee1f894..6615307da6e31a9248fcdd572c74244d582f4e43 100644 (file)
@@ -1,6 +1,11 @@
 import unittest
 from test.test_support import TestFailed, have_unicode, TESTFN
 
+# Tests that try a number of pickle protocols should have a
+#     for proto in protocols:
+# kind of outer loop.  Bump the 3 to 4 if/when protocol 3 is invented.
+protocols = range(3)
+
 class C:
     def __cmp__(self, other):
         return cmp(self.__dict__, other.__dict__)
@@ -28,6 +33,9 @@ class metaclass(type):
 class use_metaclass(object):
     __metaclass__ = metaclass
 
+# DATA and BINDATA are the protocol 0 and protocol 1 pickles of the object
+# returned by create_data().
+
 # break into multiple strings to avoid confusing font-lock-mode
 DATA = """(lp1
 I0
@@ -210,20 +218,22 @@ class AbstractPickleTests(unittest.TestCase):
         def test_unicode(self):
             endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
                         unicode('<\n>'),  unicode('<\\>')]
-            for u in endcases:
-                p = self.dumps(u)
-                u2 = self.loads(p)
-                self.assertEqual(u2, u)
+            for proto in protocols:
+                for u in endcases:
+                    p = self.dumps(u, proto)
+                    u2 = self.loads(p)
+                    self.assertEqual(u2, u)
 
     def test_ints(self):
         import sys
-        n = sys.maxint
-        while n:
-            for expected in (-n, n):
-                s = self.dumps(expected)
-                n2 = self.loads(s)
-                self.assertEqual(expected, n2)
-            n = n >> 1
+        for proto in protocols:
+            n = sys.maxint
+            while n:
+                for expected in (-n, n):
+                    s = self.dumps(expected, proto)
+                    n2 = self.loads(s)
+                    self.assertEqual(expected, n2)
+                n = n >> 1
 
     def test_maxint64(self):
         maxint64 = (1L << 63) - 1
@@ -235,6 +245,34 @@ class AbstractPickleTests(unittest.TestCase):
         data = 'I' + str(maxint64) + 'JUNK\n.'
         self.assertRaises(ValueError, self.loads, data)
 
+    def test_long(self):
+        for proto in protocols:
+            # 256 bytes is where LONG4 begins
+            for nbits in 1, 8, 8*254, 8*255, 8*256, 8*257:
+                nbase = 1L << nbits
+                for npos in nbase-1, nbase, nbase+1:
+                    for n in npos, -npos:
+                        pickle = self.dumps(n, proto)
+                        got = self.loads(pickle)
+                        self.assertEqual(n, got)
+        # Try a monster.  This is quadratic-time in protos 0 & 1, so don't
+        # bother with those.
+        # XXX Damn.  pickle.py is still quadratic-time here, due to
+        # XXX long(string, 16).  cPickle runs this in an eyeblink, but I
+        # XXX gave up waiting for pickle.py to get beyond "loading".  Giving
+        # XXX up for now.
+        return
+        print "building long"
+        nbase = long("deadbeeffeedface", 16)
+        nbase += nbase << 1000000
+        for n in nbase, -nbase:
+            print "dumping"
+            p = self.dumps(n, 2)
+            print "loading"
+            got = self.loads(p)
+            print "checking"
+            self.assertEqual(n, got)
+
     def test_reduce(self):
         pass
 
index 4364f4d9da8b4edbeb7f93bf5b0533f109630c34..b59f573dfee44ee1650ad1a71604c3a962355127 100644 (file)
@@ -464,7 +464,7 @@ write_other(Picklerobject *self, char *s, int  n)
 
 
 static int
-read_file(Unpicklerobject *self, char **s, int  n)
+read_file(Unpicklerobject *self, char **s, int n)
 {
        size_t nbytesread;
 
@@ -472,7 +472,7 @@ read_file(Unpicklerobject *self, char **s, int  n)
                int size;
 
                size = ((n < 32) ? 32 : n);
-               if (!( self->buf = (char *)malloc(size * sizeof(char))))  {
+               if (!( self->buf = (char *)malloc(size))) {
                        PyErr_NoMemory();
                        return -1;
                }
@@ -480,12 +480,11 @@ read_file(Unpicklerobject *self, char **s, int  n)
                self->buf_size = size;
        }
        else if (n > self->buf_size) {
-               self->buf = (char *)realloc(self->buf, n * sizeof(char));
+               self->buf = (char *)realloc(self->buf, n);
                if (!self->buf)  {
                        PyErr_NoMemory();
                        return -1;
                }
-
                self->buf_size = n;
        }
 
@@ -514,16 +513,16 @@ readline_file(Unpicklerobject *self, char **s)
        int i;
 
        if (self->buf_size == 0) {
-               if (!( self->buf = (char *)malloc(40 * sizeof(char))))  {
+               if (!( self->buf = (char *)malloc(40))) {
                        PyErr_NoMemory();
                        return -1;
                }
-
                self->buf_size = 40;
        }
 
        i = 0;
        while (1) {
+               int bigger;
                for (; i < (self->buf_size - 1); i++) {
                        if (feof(self->fp) ||
                            (self->buf[i] = getc(self->fp)) == '\n') {
@@ -532,14 +531,17 @@ readline_file(Unpicklerobject *self, char **s)
                                return i + 1;
                        }
                }
-               self->buf = (char *)realloc(self->buf,
-                                           (self->buf_size * 2) * sizeof(char));
+               bigger = self->buf_size << 1;
+               if (bigger <= 0) {      /* overflow */
+                       PyErr_NoMemory();
+                       return -1;
+               }
+               self->buf = (char *)realloc(self->buf, bigger);
                if (!self->buf)  {
                        PyErr_NoMemory();
                        return -1;
                }
-
-               self->buf_size *= 2;
+               self->buf_size = bigger;
        }
 }
 
@@ -620,14 +622,18 @@ readline_other(Unpicklerobject *self, char **s)
        return str_size;
 }
 
-
+/* Copy the first n bytes from s into newly malloc'ed memory, plus a
+ * trailing 0 byte.  Return a pointer to that, or NULL if out of memory.
+ * The caller is responsible for free()'ing the return value.
+ */
 static char *
-pystrndup(char *s, int l)
+pystrndup(char *s, int n)
 {
-       char *r;
-       if (!( r=malloc((l+1)*sizeof(char))))  return (char*)PyErr_NoMemory();
-       memcpy(r,s,l);
-       r[l]=0;
+       char *r = (char *)malloc(n+1);
+       if (r == NULL)
+               return (char*)PyErr_NoMemory();
+       memcpy(r, s, n);
+       r[n] = 0;
        return r;
 }
 
@@ -1012,11 +1018,93 @@ save_int(Picklerobject *self, PyObject *args)
 static int
 save_long(Picklerobject *self, PyObject *args)
 {
-       int size, res = -1;
-       PyObject *repr = 0;
+       int size;
+       int res = -1;
+       PyObject *repr = NULL;
 
        static char l = LONG;
 
+       if (self->proto >= 2) {
+               /* Linear-time pickling. */
+               size_t nbits;
+               size_t nbytes;
+               unsigned char *pdata;
+               char c_str[5];
+               int i;
+               int sign = _PyLong_Sign(args);
+
+               if (sign == 0) {
+                       /* It's 0 -- an empty bytestring. */
+                       c_str[0] = LONG1;
+                       c_str[1] = 0;
+                       i = self->write_func(self, c_str, 2);
+                       if (i < 0) goto finally;
+                       res = 0;
+                       goto finally;
+               }
+               nbits = _PyLong_NumBits(args);
+               if (nbits == (size_t)-1 && PyErr_Occurred())
+                       goto finally;
+               /* How many bytes do we need?  There are nbits >> 3 full
+                * bytes of data, and nbits & 7 leftover bits.  If there
+                * are any leftover bits, then we clearly need another
+                * byte.  Wnat's not so obvious is that we *probably*
+                * need another byte even if there aren't any leftovers:
+                * the most-significant bit of the most-significant byte
+                * acts like a sign bit, and it's usually got a sense
+                * opposite of the one we need.  The exception is longs
+                * of the form -(2**(8*j-1)) for j > 0.  Such a long is
+                * its own 256's-complement, so has the right sign bit
+                * even without the extra byte.  That's a pain to check
+                * for in advance, though, so we always grab an extra
+                * byte at the start, and cut it back later if possible.
+                */
+               nbytes = (nbits >> 3) + 1;
+               if ((int)nbytes < 0 || (size_t)(int)nbytes != nbytes) {
+                       PyErr_SetString(PyExc_OverflowError, "long too large "
+                               "to pickle");
+                       goto finally;
+               }
+               repr = PyString_FromStringAndSize(NULL, (int)nbytes);
+               if (repr == NULL) goto finally;
+               pdata = (unsigned char *)PyString_AS_STRING(repr);
+               i = _PyLong_AsByteArray((PyLongObject *)args,
+                               pdata, nbytes,
+                               1 /* little endian */, 1 /* signed */);
+               if (i < 0) goto finally;
+               /* If the long is negative, this may be a byte more than
+                * needed.  This is so iff the MSB is all redundant sign
+                * bits.
+                */
+               if (sign < 0 && nbytes > 1 && pdata[nbytes - 1] == 0xff &&
+                   (pdata[nbytes - 2] & 0x80) != 0)
+                       --nbytes;
+
+               if (nbytes < 256) {
+                       c_str[0] = LONG1;
+                       c_str[1] = (char)nbytes;
+                       size = 2;
+               }
+               else {
+                       c_str[0] = LONG4;
+                       size = (int)nbytes;
+                       for (i = 1; i < 5; i++) {
+                               c_str[i] = (char)(size & 0xff);
+                               size >>= 8;
+                       }
+                       size = 5;
+               }
+               i = self->write_func(self, c_str, size);
+               if (i < 0) goto finally;
+               i = self->write_func(self, (char *)pdata, (int)nbytes);
+               if (i < 0) goto finally;
+               res = 0;
+               goto finally;
+       }
+
+       /* proto < 2:  write the repr and newline.  This is quadratic-time
+        * (in the number of digits), in both directions.
+        */
        if (!( repr = PyObject_Repr(args)))
                goto finally;
 
@@ -1038,7 +1126,6 @@ save_long(Picklerobject *self, PyObject *args)
 
   finally:
        Py_XDECREF(repr);
-
        return res;
 }
 
@@ -2687,9 +2774,13 @@ load_int(Unpicklerobject *self)
        return res;
 }
 
-
+/* s contains x bytes of a little-endian integer.  Return its value as a
+ * C int.  Obscure:  when x is 1 or 2, this is an unsigned little-endian
+ * int, but when x is 4 it's a signed one.  This is an historical source
+ * of x-platform bugs.
+ */
 static long
-calc_binint(char *s, int  x)
+calc_binint(char *s, int x)
 {
        unsigned char c;
        int i;
@@ -2786,6 +2877,45 @@ load_long(Unpicklerobject *self)
        return res;
 }
 
+/* 'size' bytes contain the # of bytes of little-endian 256's-complement
+ * data following.
+ */
+static int
+load_counted_long(Unpicklerobject *self, int size)
+{
+       int i;
+       char *nbytes;
+       unsigned char *pdata;
+       PyObject *along;
+
+       assert(size == 1 || size == 4);
+       i = self->read_func(self, &nbytes, size);
+       if (i < 0) return -1;
+
+       size = calc_binint(nbytes, size);
+       if (size < 0) {
+               /* Corrupt or hostile pickle -- we never write one like
+                * this.
+                */
+               PyErr_SetString(PyExc_ValueError, "LONG pickle has negative "
+                               "byte count");
+               return -1;
+       }
+
+       if (size == 0)
+               along = PyLong_FromLong(0L);
+       else {
+               /* Read the raw little-endian bytes & convert. */
+               i = self->read_func(self, &(char *)pdata, size);
+               if (i < 0) return -1;
+               along = _PyLong_FromByteArray(pdata, (size_t)size,
+                               1 /* little endian */, 1 /* signed */);
+       }
+       if (along == NULL)
+               return -1;
+       PDATA_PUSH(self->stack, along, -1);
+       return 0;
+}
 
 static int
 load_float(Unpicklerobject *self)
@@ -3784,6 +3914,16 @@ load(Unpicklerobject *self)
                                break;
                        continue;
 
+               case LONG1:
+                       if (load_counted_long(self, 1) < 0)
+                               break;
+                       continue;
+
+               case LONG4:
+                       if (load_counted_long(self, 4) < 0)
+                               break;
+                       continue;
+
                case FLOAT:
                        if (load_float(self) < 0)
                                break;
index 78357843223ba11a6c414a14ba586421195e2f2a..9d7243f7e018028ee3f1db8fef61839a8fc87caf 100644 (file)
@@ -264,13 +264,13 @@ int
 _PyLong_Sign(PyObject *vv)
 {
        PyLongObject *v = (PyLongObject *)vv;
-       const int ndigits = v->ob_size;
+       const int ndigits = ABS(v->ob_size);
 
        assert(v != NULL);
        assert(PyLong_Check(v));
        assert(ndigits == 0 || v->ob_digit[ndigits - 1] != 0);
 
-       return ndigits == 0 ? 0 : (ndigits < 0 ? -1 : 1);
+       return v->ob_size == 0 ? 0 : (v->ob_size < 0 ? -1 : 1);
 }
 
 size_t