Patch #462190, patch #464070: Support quoted printable in the binascii module.

author Martin v. Löwis <martin@v.loewis.de>

Sun, 30 Sep 2001 20:32:11 +0000 (20:32 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Sun, 30 Sep 2001 20:32:11 +0000 (20:32 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Sun, 30 Sep 2001 20:32:11 +0000 (20:32 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Sun, 30 Sep 2001 20:32:11 +0000 (20:32 +0000)
diff --git a/Doc/lib/libbinascii.tex b/Doc/lib/libbinascii.tex

index a569a47d321c44f453f2e05f7d81e5c188b62f79..0ce099680a2f0ec1b9373e0ea89efb5f96b71daf 100644 (file)
--- a/Doc/lib/libbinascii.tex
+++ b/Doc/lib/libbinascii.tex
@@ -40,6 +40,24 @@ The length of \var{data} should be at most 57 to adhere to the base64
  standard.
  \end{funcdesc}
  
+\begin{funcdesc}{a2b_qp}{string\optional{, header}}
+Convert a block of quoted-printable data back to binary and return the
+binary data. More than one line may be passed at a time.
+If the optional argument \var{header} is present and true, underscores
+will be decoded as spaces.
+\end{funcdesc}
+
+\begin{funcdesc}{b2a_qp}{data\optional{, quotetabs, istext, header}}
+Convert binary data to one or more lines of \ASCII{} characters in
+quoted-printable encoding.  The return value is the converted line(s).
+If the optional argument \var{quotetabs} is present and true, all tabs
+and spaces will be encoded.  If the optional argument \var{header} is
+present and true, spaces will be encoded as underscores per RFC1522.
+If the optional argument \var{istext} is present and false, newline
+characters will be encoded as well; otherwise linefeed conversion might
+corrupt the binary data stream.
+\end{funcdesc}
+
  \begin{funcdesc}{a2b_hqx}{string}
  Convert binhex4 formatted \ASCII{} data to binary, without doing
  RLE-decompression. The string should contain a complete number of
@@ -118,4 +136,6 @@ again.
    \seemodule{binhex}{Support for the binhex format used on the Macintosh.}
  
    \seemodule{uu}{Support for UU encoding used on \UNIX.}
+
+  \seemodule{quopri}{Support for quoted-printable encoding used in MIME email messages.}
  \end{seealso}
diff --git a/Doc/lib/libquopri.tex b/Doc/lib/libquopri.tex

index 4079d2746cfe597401a25093d3ba0e22b5ca2f36..9e7895bd9bdfae35614ee67f6e2645ddb3e76185 100644 (file)
--- a/Doc/lib/libquopri.tex
+++ b/Doc/lib/libquopri.tex
@@ -7,21 +7,27 @@
  
  
  This module performs quoted-printable transport encoding and decoding,
-as defined in \rfc{1521}: ``MIME (Multipurpose Internet Mail Extensions)
-Part One''.  The quoted-printable encoding is designed for data where
-there are relatively few nonprintable characters; the base64 encoding
-scheme available via the \refmodule{base64} module is more compact if there
-are many such characters, as when sending a graphics file.
+as defined in \rfc{1521}: ``MIME (Multipurpose Internet Mail
+Extensions) Part One: Mechanisms for Specifying and Describing the
+Format of Internet Message Bodies''.  The quoted-printable encoding is
+designed for data where there are relatively few nonprintable
+characters; the base64 encoding scheme available via the
+\refmodule{base64} module is more compact if there are many such
+characters, as when sending a graphics file.
  \indexii{quoted-printable}{encoding}
  \index{MIME!quoted-printable encoding}
  
  
-\begin{funcdesc}{decode}{input, output}
+\begin{funcdesc}{decode}{input, output\optional{,header}}
  Decode the contents of the \var{input} file and write the resulting
  decoded binary data to the \var{output} file.
  \var{input} and \var{output} must either be file objects or objects that
  mimic the file object interface. \var{input} will be read until
  \code{\var{input}.readline()} returns an empty string.
+If the optional argument \var{header} is present and true, underscores
+will be decoded as spaces. This is used to decode
+``Q''-encoded headers as described in \rfc{1522}: ``MIME (Multipurpose Internet Mail Extensions)
+Part Two: Message Header Extensions for Non-ASCII Text''.
  \end{funcdesc}
  
  \begin{funcdesc}{encode}{input, output, quotetabs}
@@ -36,7 +42,7 @@ when false it leaves them unencoded.  Note that spaces and tabs
  appearing at the end of lines are always encoded, as per \rfc{1521}.
  \end{funcdesc}
  
-\begin{funcdesc}{decodestring}{s}
+\begin{funcdesc}{decodestring}{s\optional{,header}}
  Like \function{decode()}, except that it accepts a source string and
  returns the corresponding decoded string.
  \end{funcdesc}
diff --git a/Lib/quopri.py b/Lib/quopri.py

index f668abf3e191df808eaf4994582e60fea62a678d..0425735603ddf49de42a10e637afcefe208e06e8 100755 (executable)
--- a/Lib/quopri.py
+++ b/Lib/quopri.py
@@ -11,9 +11,14 @@ MAXLINESIZE = 76
  HEX = '0123456789ABCDEF'
  EMPTYSTRING = ''
  
+try:
+  from binascii import a2b_qp, b2a_qp
+except:
+  a2b_qp = None
+  b2a_qp = None
  
  
-def needsquoting(c, quotetabs):
+def needsquoting(c, quotetabs, header):
      """Decide whether a particular character needs to be quoted.
  
      The 'quotetabs' flag indicates whether embedded tabs and spaces should be
@@ -22,6 +27,9 @@ def needsquoting(c, quotetabs):
      """
      if c in ' \t':
          return quotetabs
+    # if header, we have to escape _ because _ is used to escape space
+    if c == '_': 
+        return header
      return c == ESCAPE or not (' ' <= c <= '~')
  
  def quote(c):
@@ -31,14 +39,23 @@ def quote(c):
  
  
  
-def encode(input, output, quotetabs):
+def encode(input, output, quotetabs, header = 0):
      """Read 'input', apply quoted-printable encoding, and write to 'output'.
  
      'input' and 'output' are files with readline() and write() methods.
      The 'quotetabs' flag indicates whether embedded tabs and spaces should be
      quoted.  Note that line-ending tabs and spaces are always encoded, as per
      RFC 1521.
+    The 'header' flag indicates whether we are encoding spaces as _ as per
+    RFC 1522.
      """
+
+    if b2a_qp is not None:
+        data = input.read()
+        odata = b2a_qp(data, quotetabs = quotetabs, header = header)
+        output.write(odata)
+        return
+      
      def write(s, output=output, lineEnd='\n'):
          # RFC 1521 requires that the line ending in a space or tab must have
          # that trailing character encoded.
@@ -60,9 +77,12 @@ def encode(input, output, quotetabs):
              stripped = '\n'
          # Calculate the un-length-limited encoded line
          for c in line:
-            if needsquoting(c, quotetabs):
+            if needsquoting(c, quotetabs, header):
                  c = quote(c)
-            outline.append(c)
+            if header and c == ' ':
+                outline.append('_')
+            else:
+                outline.append(c)
          # First, write out the previous line
          if prevline is not None:
              write(prevline)
@@ -80,19 +100,28 @@ def encode(input, output, quotetabs):
      if prevline is not None:
          write(prevline, lineEnd=stripped)
  
-def encodestring(s, quotetabs=0):
+def encodestring(s, quotetabs = 0, header = 0):
+    if b2a_qp is not None:
+        return b2a_qp(s, quotetabs = quotetabs, header = header)
      from cStringIO import StringIO
      infp = StringIO(s)
      outfp = StringIO()
-    encode(infp, outfp, quotetabs)
+    encode(infp, outfp, quotetabs, header)
      return outfp.getvalue()
  
  
  
-def decode(input, output):
+def decode(input, output, header = 0):
      """Read 'input', apply quoted-printable decoding, and write to 'output'.
+    'input' and 'output' are files with readline() and write() methods.
+    If 'header' is true, decode underscore as space (per RFC 1522)."""
+
+    if a2b_qp is not None:
+        data = input.read()
+        odata = a2b_qp(data, header = header)
+        output.write(odata)
+        return
  
-    'input' and 'output' are files with readline() and write() methods."""
      new = ''
      while 1:
          line = input.readline()
@@ -107,7 +136,9 @@ def decode(input, output):
              partial = 1
          while i < n:
              c = line[i]
-            if c != ESCAPE:
+            if c == '_' and header:
+                new = new + ' '; i = i+1
+            elif c != ESCAPE:
                  new = new + c; i = i+1
              elif i+1 == n and not partial:
                  partial = 1; break
@@ -123,11 +154,13 @@ def decode(input, output):
      if new:
          output.write(new)
  
-def decodestring(s):
+def decodestring(s, header = 0):
+    if a2b_qp is not None:
+        return a2b_qp(s, header = header)
      from cStringIO import StringIO
      infp = StringIO(s)
      outfp = StringIO()
-    decode(infp, outfp)
+    decode(infp, outfp, header = header)
      return outfp.getvalue()
  
  
diff --git a/Lib/test/test_quopri.py b/Lib/test/test_quopri.py

index 0e997271fc48e5f19955538dd85b2d78fcf0a857..2497705a2d8bfa3361bda9f79cd50a4b1bc5c6cc 100644 (file)
--- a/Lib/test/test_quopri.py
+++ b/Lib/test/test_quopri.py
@@ -104,6 +104,12 @@ zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz''')
          ('hello\tworld', 'hello=09world'),
          )
  
+    # These are used in the "header=1" tests.
+    HSTRINGS = (
+        ('hello world', 'hello_world'),
+        ('hello_world', 'hello=5Fworld'),
+        )
+
      def test_encodestring(self):
          for p, e in self.STRINGS:
              self.assert_(encodestring(p) == e)
@@ -135,6 +141,13 @@ zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz''')
              self.assert_(encodestring(p, quotetabs=1) == e)
              self.assert_(decodestring(e) == p)
  
+    def test_encode_header(self):
+        for p, e in self.HSTRINGS:
+            self.assert_(encodestring(p, header = 1) == e)
+
+    def test_decode_header(self):
+        for p, e in self.HSTRINGS:
+            self.assert_(decodestring(e, header = 1) == p)
  
  def test_main():
      test_support.run_unittest(QuopriTestCase)
diff --git a/Misc/NEWS b/Misc/NEWS

index 1b2b76b9578859248ccc0231d7085edca69e8b79..4b6ae5b7f6d47ba8b74941e110734f5a6c39b644 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -6,8 +6,13 @@ Type/class unification and new-style classes
  
  Core
  
+- binascii now has two quopri support functions, a2b_qp and b2a_qp.
+
  Library
  
+- quopri's encode and decode functions take an optional header parameter,
+  which indicates whether output is intended for the header 'Q' encoding.
+
  Tools
  
  Build
diff --git a/Modules/binascii.c b/Modules/binascii.c

index 00a28052d9781a9d44ce992ea23d078aab9c674f..484f656d14b9edcba22d69accf6ee71c5c556ff0 100644 (file)
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -42,6 +42,15 @@
  ** does make the performance sub-optimal. Oh well, too bad...
  **
  ** Jack Jansen, CWI, July 1995.
+** 
+** Added support for quoted-printable encoding, based on RFC 1521 et al.
+** Quoted-printable encoding specifies that non-printable characters (anything
+** below 32 or above 126) be encoded as =XX, where XX is the hexadecimal value
+** of the character.  It also specifies some other behavior to enable 8-bit data
+** in a mail message with little difficulty (maximum line sizes, protecting
+** some cases of whitespace, etc.).
+**
+** Brandon Long, September 2001.
  */
  
  
@@ -971,6 +980,289 @@ static char doc_unhexlify[] =
  hexstr must contain an even number of hex digits (upper or lower case).\n\
  This function is also available as \"unhexlify()\"";
  
+static int table_hex[128] = {
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+   0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,
+  -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
+};
+
+#define hexval(c) table_hex[(unsigned int)(c)]
+
+#define MAXLINESIZE 76
+
+static char doc_a2b_qp[] = "Decode a string of qp-encoded data";
+
+static PyObject* 
+binascii_a2b_qp(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+       unsigned int in, out;
+       char ch;
+       unsigned char *data, *odata;
+       unsigned int datalen = 0;
+       PyObject *rv;
+       static char *kwlist[] = {"data", "header", NULL};
+       int header = 0;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|i", kwlist, &data, 
+             &datalen, &header))
+               return NULL;
+
+       /* We allocate the output same size as input, this is overkill */
+       odata = (char *) calloc(1, datalen);
+
+       if (odata == NULL) {
+               PyErr_NoMemory();
+               return NULL;
+       }
+
+       in = out = 0;
+       while (in < datalen) {
+               if (data[in] == '=') {
+                       in++;
+                       if (in >= datalen) break;
+                       /* Soft line breaks */
+                       if ((data[in] == '\n') || (data[in] == '\r') || 
+                           (data[in] == ' ') || (data[in] == '\t')) {
+                               if (data[in] != '\n') {
+                                       while (in < datalen && data[in] != '\n') in++;
+                               }
+                               if (in < datalen) in++;
+                       }
+                       else if (data[in] == '=') {
+                               /* broken case from broken python qp */
+                               odata[out++] = '=';
+                               in++;
+                       }
+                       else if (((data[in] >= 'A' && data[in] <= 'F') || 
+                                 (data[in] >= 'a' && data[in] <= 'f') ||
+                                 (data[in] >= '0' && data[in] <= '9')) &&
+                                ((data[in+1] >= 'A' && data[in+1] <= 'F') ||
+                                 (data[in+1] >= 'a' && data[in+1] <= 'f') ||
+                                 (data[in+1] >= '0' && data[in+1] <= '9'))) {
+                               /* hexval */
+                               ch = hexval(data[in]) << 4;
+                               in++;
+                               ch |= hexval(data[in]);
+                               in++;
+                               odata[out++] = ch;
+                       }
+                       else {
+                         odata[out++] = '=';
+                       }
+               }
+               else if (header && data[in] == '_') {
+                       odata[out++] = ' ';
+                       in++;
+               }
+               else {
+                       odata[out] = data[in];
+                       in++;
+                       out++;
+               }
+       }
+       if ((rv = PyString_FromStringAndSize(odata, out)) == NULL) {
+               free (odata);
+               return NULL;
+       }
+       free (odata);
+       return rv;
+}
+
+static int 
+to_hex (unsigned char ch, unsigned char *s)
+{
+       unsigned int uvalue = ch;
+
+       s[1] = "0123456789ABCDEF"[uvalue % 16];
+       uvalue = (uvalue / 16);
+       s[0] = "0123456789ABCDEF"[uvalue % 16];
+       return 0;
+}
+
+static char doc_b2a_qp[] = 
+"b2a_qp(data, quotetabs=0, istext=1, header=0) -> s; \n\
+ Encode a string using quoted-printable encoding. \n\
+\n\
+On encoding, when istext is set, newlines are not encoded, and white \n\
+space at end of lines is.  When istext is not set, \\r and \\n (CR/LF) are \n\
+both encoded.  When quotetabs is set, space and tabs are encoded.";
+
+/* XXX: This is ridiculously complicated to be backward compatible
+ * (mostly) with the quopri module.  It doesn't re-create the quopri
+ * module bug where text ending in CRLF has the CR encoded */
+static PyObject* 
+binascii_b2a_qp (PyObject *self, PyObject *args, PyObject *kwargs)
+{
+       unsigned int in, out;
+       unsigned char *data, *odata;
+       unsigned int datalen = 0, odatalen = 0;
+       PyObject *rv;
+       unsigned int linelen = 0;
+       static char *kwlist[] = {"data", "quotetabs", "istext", "header", NULL};
+       int istext = 1;
+       int quotetabs = 0;
+       int header = 0;
+       unsigned char ch;
+       int crlf = 0;
+       unsigned char *p;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|iii", kwlist, &data, 
+             &datalen, &quotetabs, &istext, &header))
+               return NULL;
+
+       /* See if this string is using CRLF line ends */
+       /* XXX: this function has the side effect of converting all of
+        * the end of lines to be the same depending on this detection
+        * here */
+       p = strchr(data, '\n');
+       if ((p != NULL) && (p > data) && (*(p-1) == '\r'))
+               crlf = 1;
+
+       /* First, scan to see how many characters need to be encoded */
+       in = 0;
+       while (in < datalen) {
+               if ((data[in] > 126) || 
+                   (data[in] == '=') ||
+                   (header && data[in] == '_') ||
+                   ((data[in] == '.') && (linelen == 1)) ||
+                   (!istext && ((data[in] == '\r') || (data[in] == '\n'))) ||
+                   ((data[in] == '\t' || data[in] == ' ') && (in + 1 == datalen)) ||
+                   ((data[in] < 33) && 
+                    (data[in] != '\r') && (data[in] != '\n') && 
+                    (quotetabs && ((data[in] != '\t') || (data[in] != ' ')))))
+               {
+                       if ((linelen + 3) >= MAXLINESIZE) {
+                               linelen = 0;
+                               if (crlf)
+                                       odatalen += 3;
+                               else
+                                       odatalen += 2;
+                       }
+                       linelen += 3;
+                       odatalen += 3;
+                       in++;
+               }
+               else {
+                       if (istext && 
+                           ((data[in] == '\n') ||
+                            ((in+1 < datalen) && (data[in] == '\r') &&
+                            (data[in+1] == '\n'))))
+                       {
+                               linelen = 0;
+                               /* Protect against whitespace on end of line */
+                               if (in && ((data[in-1] == ' ') || (data[in-1] == '\t')))
+                                       odatalen += 2;
+                               if (crlf)
+                                       odatalen += 2;
+                               else
+                                       odatalen += 1;
+                               if (data[in] == '\r')
+                                       in += 2;
+                               else
+                                       in++;
+                       }
+                       else {
+                               if ((in + 1 != datalen) && 
+                                   (data[in+1] != '\n') &&
+                                   (linelen + 1) >= MAXLINESIZE) {
+                                       linelen = 0;
+                                       if (crlf)
+                                               odatalen += 3;
+                                       else
+                                               odatalen += 2;
+                               }
+                               linelen++;
+                               odatalen++;
+                               in++;
+                       }
+               }
+       }
+
+       odata = (char *) calloc(1, odatalen);
+
+       if (odata == NULL) {
+               PyErr_NoMemory();
+               return NULL;
+       }
+
+       in = out = linelen = 0;
+       while (in < datalen) {
+               if ((data[in] > 126) || 
+                   (data[in] == '=') ||
+                   (header && data[in] == '_') ||
+                   ((data[in] == '.') && (linelen == 1)) ||
+                   (!istext && ((data[in] == '\r') || (data[in] == '\n'))) ||
+                   ((data[in] == '\t' || data[in] == ' ') && (in + 1 == datalen)) ||
+                   ((data[in] < 33) && 
+                    (data[in] != '\r') && (data[in] != '\n') && 
+                    (quotetabs && ((data[in] != '\t') || (data[in] != ' ')))))
+               {
+                       if ((linelen + 3 )>= MAXLINESIZE) {
+                               odata[out++] = '=';
+                               if (crlf) odata[out++] = '\r';
+                               odata[out++] = '\n';
+                               linelen = 0;
+                       }
+                       odata[out++] = '=';
+                       to_hex(data[in], &odata[out]);
+                       out += 2;
+                       in++;
+                       linelen += 3;
+               }
+               else {
+                       if (istext && 
+                           ((data[in] == '\n') ||
+                            ((in+1 < datalen) && (data[in] == '\r') &&
+                            (data[in+1] == '\n'))))
+                       {
+                               linelen = 0;
+                               /* Protect against whitespace on end of line */
+                               if (out && ((odata[out-1] == ' ') || (odata[out-1] == '\t'))) {
+                                       ch = odata[out-1];
+                                       odata[out-1] = '=';
+                                       to_hex(ch, &odata[out]);
+                                       out += 2;
+                               }
+                                       
+                               if (crlf) odata[out++] = '\r';
+                               odata[out++] = '\n';
+                               if (data[in] == '\r')
+                                       in += 2;
+                               else
+                                       in++;
+                       }
+                       else {
+                               if ((in + 1 != datalen) && 
+                                   (data[in+1] != '\n') &&
+                                   (linelen + 1) >= MAXLINESIZE) {
+                                       odata[out++] = '=';
+                                       if (crlf) odata[out++] = '\r';
+                                       odata[out++] = '\n';
+                                       linelen = 0;
+                               }
+                               linelen++;
+                               if (header && data[in] == ' ') {
+                                       odata[out++] = '_';
+                                       in++;
+                               }
+                               else {
+                                       odata[out++] = data[in++];
+                               }
+                       }
+               }
+       }
+       if ((rv = PyString_FromStringAndSize(odata, out)) == NULL) {
+               free (odata);
+               return NULL;
+       }
+       free (odata);
+       return rv;
+}
  
  /* List of functions defined in the module */
  
@@ -990,6 +1282,10 @@ static struct PyMethodDef binascii_module_methods[] = {
          doc_rledecode_hqx},
         {"crc_hqx",    binascii_crc_hqx,    METH_VARARGS, doc_crc_hqx},
         {"crc32",      binascii_crc32,      METH_VARARGS, doc_crc32},
+       {"a2b_qp", (PyCFunction)binascii_a2b_qp, METH_VARARGS | METH_KEYWORDS, 
+         doc_a2b_qp},
+       {"b2a_qp", (PyCFunction)binascii_b2a_qp, METH_VARARGS | METH_KEYWORDS, 
+          doc_b2a_qp},
         {NULL, NULL}                         /* sentinel */
  };
author	Martin v. Löwis <martin@v.loewis.de>
	Sun, 30 Sep 2001 20:32:11 +0000 (20:32 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Sun, 30 Sep 2001 20:32:11 +0000 (20:32 +0000)
Doc/lib/libbinascii.tex		patch \| blob \| history
Doc/lib/libquopri.tex		patch \| blob \| history
Lib/quopri.py		patch \| blob \| history
Lib/test/test_quopri.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Modules/binascii.c		patch \| blob \| history