New parser. Next up, making the current parser use this parser

author Anthony Baxter <anthonybaxter@gmail.com>

Mon, 22 Mar 2004 00:33:28 +0000 (00:33 +0000)

committer Anthony Baxter <anthonybaxter@gmail.com>

Mon, 22 Mar 2004 00:33:28 +0000 (00:33 +0000)
author Anthony Baxter <anthonybaxter@gmail.com>
Mon, 22 Mar 2004 00:33:28 +0000 (00:33 +0000)
committer Anthony Baxter <anthonybaxter@gmail.com>
Mon, 22 Mar 2004 00:33:28 +0000 (00:33 +0000)
diff --git a/Lib/email/FeedParser.py b/Lib/email/FeedParser.py

new file mode 100644 (file)

index 0000000..a82d305
--- /dev/null
+++ b/Lib/email/FeedParser.py
@@ -0,0 +1,362 @@
+# A new Feed-style Parser
+
+from email import Errors, Message
+import re
+
+NLCRE = re.compile('\r\n|\r|\n')  # one line ending: CRLF, CR or LF
+
+EMPTYSTRING = ''  # joiner used to reassemble text from lists of lines
+NL = '\n'
+
+NeedMoreData = object()  # sentinel: no complete line is buffered yet
+
+class FeedableLumpOfText:
+    "A line buffer: text goes in through push(), complete lines come out"
+
+    def __init__(self):
+        self._partial = ''  # unterminated tail of the last push()
+        self._done = False  # set by end()
+        # _pending holds complete lines in reverse order: the next one is last
+        self._pending = []
+
+    def readline(self):
+        """ Return the next line of data and remove it from the buffer.
+
+            Lines pushed back with unreadline() come out first, most recent
+            first. With no line pending: NeedMoreData, or '' after end().
+        """
+        if not self._pending:
+            if self._done:
+                return ''
+            return NeedMoreData
+        return self._pending.pop()
+
+    def unreadline(self, line):
+        """ Push a line back; it becomes the next line read or peeked.
+        """
+        self._pending.append(line)
+
+    def peekline(self):
+        """ Return what readline() would return, without consuming it """
+        if not self._pending:
+            if self._done:
+                return ''
+            return NeedMoreData
+        return self._pending[-1]
+
+
+    # readuntil() is a generator; drive it like this:
+    #     for r in self._input.readuntil(regexp):
+    #         if r is NeedMoreData: yield NeedMoreData
+    #         else: preamble, matchobj = r; break
+    def readuntil(self, matchre, afterblank=False, includematch=False):
+        """ Generator: read a line at a time until one matches matchre.
+
+            Yields NeedMoreData while no line is pending and end() has not
+            been called, then a tuple of the text before the match (plus the
+            matched line, if includematch is true) and the RE match object.
+            afterblank is meant to require a blank line before the matched
+            line (see the review notes in the body). The matched line is
+            consumed. At end-of-file the tuple is (text so far, None).
+        """
+        prematch = []
+        blankseen = 0
+        while 1: 
+            if not self._pending:
+                if self._done:
+                    # end of file
+                    yield EMPTYSTRING.join(prematch), None
+                else:
+                    yield NeedMoreData
+                continue
+            line = self._pending.pop()
+            if afterblank:
+                if NLCRE.match(line):
+                    blankseen = 1
+                    continue  # NOTE(review): the blank line is not kept in prematch
+                else:
+                    blankseen = 0  # NOTE(review): reset before the test below, so afterblank never matches
+            m = matchre.match(line)
+            if (m and not afterblank) or (m and afterblank and blankseen):
+                if includematch:
+                    prematch.append(line)
+                yield EMPTYSTRING.join(prematch), m  # _parsegen breaks out of its loop on this tuple
+            prematch.append(line)
+
+
+    NLatend = re.compile('(\r\n|\r|\n)$').match  # NOTE(review): .match anchors at the start, so only a bare line ending matches
+    NLCRE_crack = re.compile('(\r\n|\r|\n)')  # the group makes split() return the line endings too
+
+    def push(self, data):
+        """ Add a chunk of text; its complete lines become readable """
+        # Prepend the unterminated tail left over from the previous push
+        data, self._partial = self._partial+data, ''
+        # Crack into lines, but leave the newlines on the end of each
+        lines = self.NLCRE_crack.split(data)
+        # The *ahem* interesting behaviour of re.split when supplied
+        # groups means that the last element is the data after the
+        # final RE. In the case of a NL/CR terminated string, this is
+        # the empty string. It is held in _partial until the next push.
+        self._partial = lines.pop()
+        o = []
+        for i in range(len(lines) / 2):  # Python 2 integer division; lines alternates text, line ending
+            o.append(EMPTYSTRING.join([lines[i*2], lines[i*2+1]]))
+        self.pushlines(o)
+    
+    def pushlines(self, lines):
+        """ Queue a list of new lines behind everything already pending """
+        # Reverse and insert at the front of _pending, which is read from its end
+        self._pending[:0] = lines[::-1]
+
+    def end(self):
+        """ Declare that no more data will be pushed """
+        self._done = True  # NOTE(review): text still held in _partial is never delivered
+
+    def is_done(self):  # true once end() has been called
+        return self._done
+
+    def __iter__(self):
+        return self
+
+    def next(self):  # Python 2 iterator protocol; may return NeedMoreData
+        l = self.readline()
+        if l == '':  # readline() returns '' only after end() with nothing pending
+            raise StopIteration
+        return l
+
+class FeedParser:
+    "Incremental email parser: feed() it text in chunks; end() returns the root object"
+
+    def __init__(self, _class=Message.Message):
+        "Create a parser; _class is called with no arguments to make each message object"
+        self._class = _class
+        self._input = FeedableLumpOfText()
+        self._root = None  # set to the first object created by _parsegen
+        self._objectstack = []  # objects being parsed, innermost last
+        self._parse = self._parsegen().next  # resumes the parse generator (Python 2 .next)
+
+    def end(self):  # mark the input finished, resume the parser once, return the root
+        self._input.end()
+        self._call_parse()
+        return self._root
+
+    def feed(self, data):  # buffer the text, then resume the parser once
+        self._input.push(data)
+        self._call_parse()
+
+    def _call_parse(self):
+        try:
+            self._parse()
+        except StopIteration:  # the parse generator has run to completion
+            pass
+
+    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')  # Unix-From, "Name:" or continuation line
+
+    def _parse_headers(self,headerlist):
+        # Store the header lines in headerlist on self._cur. Raises
+        # Errors.HeaderParseError on a leading continuation or a line with no ':'
+        lastheader = ''
+        lastvalue = []
+
+
+        for lineno, line in enumerate(headerlist):
+            # Check for continuation
+            if line[0] in ' \t':
+                if not lastheader:
+                    raise Errors.HeaderParseError('First line must not be a continuation')
+                lastvalue.append(line)
+                continue
+
+            if lastheader:
+                # XXX reconsider the joining of folded lines
+                self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
+                lastheader, lastvalue = '', []
+
+            # Check for Unix-From
+            if line.startswith('From '):
+                if lineno == 0:
+                    self._cur.set_unixfrom(line)
+                    continue
+                elif lineno == len(headerlist) - 1:
+                    # Something looking like a unix-from at the end - it's
+                    # probably the first line of the body
+                    self._input.unreadline(line)
+                    return
+                else:
+                    # Weirdly placed unix-from line. Ignore it.
+                    continue
+
+            i = line.find(':')
+            if i < 0:
+                # The older parser had various special-cases here. We've
+                # already handled them
+                raise Errors.HeaderParseError(
+                       "Not a header, not a continuation: ``%s''" % line)
+            lastheader = line[:i]
+            lastvalue = [line[i+1:].lstrip()]
+
+        if lastheader:
+            # XXX reconsider the joining of folded lines
+            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
+
+
+    def _parsegen(self):
+        # Generator that drives the parse; it yields whenever it needs more input
+        self._new_sub_object()
+        self._root = self._cur
+        completing = False
+        last = None
+        
+        for line in self._input:
+            if line is NeedMoreData:
+                yield None # Need More Data
+                continue
+            self._input.unreadline(line)  # the line was read only to see that one is available
+            if not completing:
+                headers = []
+                # Now collect all headers.
+                for line in self._input:
+                    if line is NeedMoreData:
+                        yield None # Need More Data
+                        continue
+                    if not self.headerRE.match(line):
+                        self._parse_headers(headers)
+                        # A message/rfc822 has no body and no internal 
+                        # boundary.
+                        if self._cur.get_content_maintype() == "message":
+                            self._new_sub_object()
+                            completing = False
+                            headers = []
+                            continue
+                        if line.strip():
+                            # No blank line between headers and body. 
+                            # Push this line back, it's the first line of 
+                            # the body.
+                            self._input.unreadline(line)
+                        break
+                    else:
+                        headers.append(line)
+                else:
+                    # We're done with the data and are still inside the headers
+                    self._parse_headers(headers)
+
+            # Now we're dealing with the body
+            boundary = self._cur.get_boundary()
+            isdigest = (self._cur.get_content_type() == 'multipart/digest')
+            if boundary and not self._cur._finishing:
+                separator = '--' + boundary
+                self._cur._boundaryRE = re.compile(
+                        r'(?P<sep>' + re.escape(separator) +
+                        r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
+                for r in self._input.readuntil(self._cur._boundaryRE):
+                    if r is NeedMoreData:
+                         yield NeedMoreData
+                    else:
+                        preamble, matchobj = r
+                        break
+                if not matchobj:
+                    # Broken - we hit the end of file. Just set the body 
+                    # to the text.
+                    if completing:
+                        self._attach_trailer(last, preamble)
+                    else:
+                        self._attach_preamble(self._cur, preamble)
+                    # XXX move back to the parent container.
+                    self._pop_container()
+                    completing = True
+                    continue
+                if preamble:
+                    if completing:
+                        preamble = preamble[:-len(matchobj.group('linesep'))]
+                        self._attach_trailer(last, preamble)
+                    else:
+                        self._attach_preamble(self._cur, preamble)
+                elif not completing:
+                    # The module docs specify an empty preamble is None, not ''
+                    self._cur.preamble = None
+                    # If we _are_ completing, the last object gets no payload
+
+                if matchobj.group('end'):
+                    # That was the end boundary tag. Bounce back to the
+                    # parent container
+                    last = self._pop_container()
+                    self._input.unreadline(matchobj.group('linesep'))
+                    completing = True
+                    continue
+
+                # A number of MTAs produced by a nameless large company
+                # we shall call "SicroMoft" produce repeated boundary 
+                # lines.
+                while True:
+                    line = self._input.peekline()
+                    if line is NeedMoreData:
+                        yield None
+                        continue
+                    if self._cur._boundaryRE.match(line):
+                        self._input.readline()
+                    else:
+                        break
+
+                self._new_sub_object()
+                
+                completing = False
+                if isdigest:
+                    self._cur.set_default_type('message/rfc822')
+                    continue
+            else:
+                # non-multipart or after end-boundary
+                if last is not self._root:
+                    last = self._pop_container()
+                if self._cur.get_content_maintype() == "message":
+                    # We double-pop to leave the RFC822 object
+                    self._pop_container()
+                    completing = True
+                elif self._cur._boundaryRE and last <> self._root:  # <> is the Python 2 spelling of !=
+                    completing = True
+                else:
+                    # Non-multipart top level, or in the trailer of the 
+                    # top level multipart
+                    while not self._input.is_done():
+                        yield None
+                    data = list(self._input)
+                    body = EMPTYSTRING.join(data)
+                    self._attach_trailer(last, body)
+
+
+    def _attach_trailer(self, obj, trailer):
+        # Store trailer as obj's epilogue if obj is multipart/message, else as its payload
+        if obj.get_content_maintype() in ( "multipart", "message" ):
+            obj.epilogue = trailer
+        else:
+            obj.set_payload(trailer)
+
+    def _attach_preamble(self, obj, trailer):  # trailer holds the preamble text here
+        if obj.get_content_maintype() in ( "multipart", "message" ):
+            obj.preamble = trailer
+        else:
+            obj.set_payload(trailer)
+
+
+    def _new_sub_object(self):
+        new = self._class()
+        # Attach the new object to the innermost open object (if any), then make it current
+        if self._objectstack:
+            self._objectstack[-1].attach(new)
+        self._objectstack.append(new)
+        new._boundaryRE = None  # compiled in _parsegen once the object's boundary is known
+        new._finishing = False
+        self._cur = new
+
+    def _pop_container(self):
+        # Move the pointer to the container of the current object.
+        # Returns the (old) current object
+        # If the stack is now empty, _cur is left unchanged and marked
+        # _finishing, which stops _parsegen from searching for its boundary again
+        last = self._objectstack.pop()
+        if self._objectstack:
+            self._cur = self._objectstack[-1]
+        else:
+            self._cur._finishing = True
+        return last
+
+
author	Anthony Baxter <anthonybaxter@gmail.com>
	Mon, 22 Mar 2004 00:33:28 +0000 (00:33 +0000)
committer	Anthony Baxter <anthonybaxter@gmail.com>
	Mon, 22 Mar 2004 00:33:28 +0000 (00:33 +0000)