granicus.if.org Git - python/commitdiff
#9286: Fix the rfc822 parser to preserve whitespace in address local part.
author R. David Murray <rdmurray@bitdance.com>
Sat, 18 Dec 2010 18:25:38 +0000 (18:25 +0000)
committer R. David Murray <rdmurray@bitdance.com>
Sat, 18 Dec 2010 18:25:38 +0000 (18:25 +0000)
Such addresses are not RFC compliant except under the 'obsolete syntax'
rules, but before this fix the whitespace was dropped from the input,
concatenating the pieces.  That breaks one of the principles of the
email package, that of preserving the input as much as possible.
It also denies the application program the opportunity to apply its
own heuristics to interpretation of such non-compliant addresses.

It is possible users of the email package were depending on the local
part always being a single token, so this fix will not be backported.

Lib/email/_parseaddr.py
Lib/email/test/test_email.py
Misc/NEWS

index 3bd4ba44030e426a32be73c49d4aed1b21f7c7ed..699d418b3feb8241003b4890270adfe4862dbfa7 100644 (file)
@@ -199,14 +199,18 @@ class AddrlistClass:
         self.commentlist = []
 
     def gotonext(self):
-        """Parse up to the start of the next address."""
+        """Skip white space and extract comments."""
+        wslist = []
         while self.pos < len(self.field):
             if self.field[self.pos] in self.LWS + '\n\r':
+                if self.field[self.pos] not in '\n\r':
+                    wslist.append(self.field[self.pos])
                 self.pos += 1
             elif self.field[self.pos] == '(':
                 self.commentlist.append(self.getcomment())
             else:
                 break
+        return EMPTYSTRING.join(wslist)
 
     def getaddrlist(self):
         """Parse all addresses.
@@ -319,16 +323,24 @@ class AddrlistClass:
 
         self.gotonext()
         while self.pos < len(self.field):
+            preserve_ws = True
             if self.field[self.pos] == '.':
+                if aslist and not aslist[-1].strip():
+                    aslist.pop()
                 aslist.append('.')
                 self.pos += 1
+                preserve_ws = False
             elif self.field[self.pos] == '"':
                 aslist.append('"%s"' % quote(self.getquote()))
             elif self.field[self.pos] in self.atomends:
+                if aslist and not aslist[-1].strip():
+                    aslist.pop()
                 break
             else:
                 aslist.append(self.getatom())
-            self.gotonext()
+            ws = self.gotonext()
+            if preserve_ws and ws:
+                aslist.append(ws)
 
         if self.pos >= len(self.field) or self.field[self.pos] != '@':
             return EMPTYSTRING.join(aslist)
index 78fb9616152e2fbad01090121bba12acdd1893a4..e5eece2720ce984b5eaa02d01d0b7c45bcc6ebc9 100644 (file)
@@ -2342,6 +2342,24 @@ class TestMiscellaneous(TestEmailBase):
         eq(utils.parseaddr('"\\\\"example\\\\" example"@example.com'),
           ('', '"\\\\"example\\\\" example"@example.com'))
 
+    def test_parseaddr_preserves_spaces_in_local_part(self):
+        # issue 9286.  A normal RFC5322 local part should not contain any
+        # folding white space, but legacy local parts can (they are a sequence
+        # of atoms, not dotatoms).  On the other hand we strip whitespace from
+        # before the @ and around dots, on the assumption that the whitespace
+        # around the punctuation is a mistake in what would otherwise be
+        # an RFC5322 local part.  Leading whitespace is, as usual, stripped as well.
+        self.assertEqual(('', "merwok wok@xample.com"),
+            utils.parseaddr("merwok wok@xample.com"))
+        self.assertEqual(('', "merwok  wok@xample.com"),
+            utils.parseaddr("merwok  wok@xample.com"))
+        self.assertEqual(('', "merwok  wok@xample.com"),
+            utils.parseaddr(" merwok  wok  @xample.com"))
+        self.assertEqual(('', 'merwok"wok"  wok@xample.com'),
+            utils.parseaddr('merwok"wok"  wok@xample.com'))
+        self.assertEqual(('', 'merwok.wok.wok@xample.com'),
+            utils.parseaddr('merwok. wok .  wok@xample.com'))
+
     def test_multiline_from_comment(self):
         x = """\
 Foo
index 0e90db27db8ce7776198dd2c1bffb30b1fe9e880..9428d1b6f6a4d1e621cac3a0c4e4c94c102ac15d 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -23,6 +23,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #9286: email.utils.parseaddr no longer concatenates blank-separated
+  words in the local part of email addresses, thereby preserving the input.
+
 - Issue #6791: Limit header line length (to 65535 bytes) in http.client
   and http.server, to avoid denial of services from the other party.