Extracting the body of an email from mbox file, decoding it to plain text regardless of Charset and Content Transfer Encoding

Question 1

Extracting the body of an email from mbox file, decoding it to plain text regardless of Charset and Content Transfer Encoding

email python-3.x content-type plaintext mbox

DCB · Aug 23, 2011 · Viewed 13.7k times · Source

Answer

Answer

Here is some code that does the job; for messages that it cannot decode, it prints an error instead of crashing. I hope that it may be useful. Note that `.get_payload(decode=True)` currently returns a bytes object; if that is a bug in Python 3 and it gets fixed, those calls may return a str object instead. I ran this code today on Python 2.7.2 and on Python 3.2.1.

import mailbox

def getcharsets(msg):
    """Return the set of distinct charset names reported by *msg*.

    Every entry of ``msg.get_charsets()`` that is not ``None`` is kept;
    duplicates collapse because the result is a set.
    """
    return {name for name in msg.get_charsets() if name is not None}

def handleerror(errmsg, emailmsg, cs):
    """Print a short diagnostic report for a message that failed to decode.

    errmsg   -- text describing the error; printed on its own line.
    emailmsg -- the message being processed; its 'subject' and 'From'
                headers and its charsets (via getcharsets) are reported.
    cs       -- the charset that was being tried when the error occurred.

    Returns None; the report goes to standard output.
    """
    # A blank line separates this report from whatever was printed before.
    print()
    print(errmsg)
    print("This error occurred while decoding with ", cs, " charset.")

    found_charsets = getcharsets(emailmsg)
    print("These charsets were found in the one email.", found_charsets)

    subject = emailmsg['subject']
    print("This is the subject:", subject)

    sender = emailmsg['From']
    print("This is the sender:", sender)

def getbodyfromemail(msg):
    """Return the text/plain body of *msg* decoded to a unicode string.

    The first text/plain part that is not marked as an attachment is used.
    Its Content-Transfer-Encoding is undone by get_payload(decode=True) and
    the resulting bytes are decoded with the charset declared on that same
    part.  If that charset is missing, unknown or does not fit the bytes, the
    other charsets declared in the message are tried in order (each failure
    is reported through handleerror), and as a last resort the bytes are
    decoded as UTF-8 with undecodable bytes replaced.

    msg -- an email.message.Message (mailbox messages qualify).

    Returns the decoded text, or None when the message has no usable
    text/plain part.
    """
    # walk() yields msg itself and then every nested part, so one loop covers
    # non-multipart messages as well as arbitrarily nested multipart ones.
    textpart = None
    for part in msg.walk():
        if part.is_multipart():
            continue
        if part.get_content_type() != 'text/plain':
            continue
        # A text/plain attachment is not the message body; skip it.
        disposition = str(part.get('Content-Disposition', '')).lower()
        if disposition.startswith('attachment'):
            continue
        textpart = part
        break

    if textpart is None:
        return None

    body = textpart.get_payload(decode=True)
    if body is None:
        return None
    if not isinstance(body, bytes):
        # Already text (see the note about get_payload possibly returning str).
        return body

    # Try the part's own charset first, then any others declared in the message.
    tried = []
    for charset in [textpart.get_content_charset()] + list(msg.get_charsets()):
        if charset is None or charset in tried:
            continue
        tried.append(charset)
        try:
            return body.decode(charset)
        except UnicodeDecodeError:
            handleerror("UnicodeDecodeError: encountered.", msg, charset)
        except LookupError:
            # The message names a charset that Python has no codec for.
            handleerror("LookupError: encountered.", msg, charset)

    # Nothing declared worked: never return raw bytes, degrade gracefully instead.
    return body.decode('utf-8', 'replace')


# Path of the mbox file to read. This is a placeholder: replace it with the
# location of a real mailbox before running.
mboxfile = 'C:/Users/Username/Documents/Thunderbird/Data/profile/ImapMail/server.name/INBOX'
print(mboxfile)
# create=False makes a mistyped path raise an error instead of silently
# creating a new, empty mailbox file at that location.
for thisemail in mailbox.mbox(mboxfile, create=False):
    body = getbodyfromemail(thisemail)
    if body is None:
        # getbodyfromemail can return None (e.g. no text/plain part was found);
        # slicing None would raise a TypeError.
        print("(no text/plain body found)")
        continue
    # Only show the beginning of long messages.
    print(body[0:1000])

Question 2

I am trying to use Python 3 to extract the body of email messages from a thunderbird mbox file. It is an IMAP account.

I would like to have the text part of the body of the email available to process as a unicode string. It should 'look like' the email does in Thunderbird, and not contain escaped characters such as \r\n =20 etc.

I think that it is the Content Transfer Encodings that I don't know how to decode or remove. I receive emails with a variety of different Content Types, and different Content Transfer Encodings. This is my current attempt:

import mailbox
import quopri,base64

# Undo a Content-Transfer-Encoding: quoted-printable goes through quopri and
# base64 goes through base64.b64decode.
# NOTE(review): `result` is assigned but there is no return statement, so the
# caller always receives None; for any other encoding value `result` is never
# assigned at all.
# NOTE(review): `encoded` is handed to the decoders unchanged; the traceback
# shown after this listing is quopri.decodestring rejecting a str argument.
def myconvert(encoded,ContentTransferEncoding):
    if ContentTransferEncoding == 'quoted-printable':
        result = quopri.decodestring(encoded)
    elif ContentTransferEncoding == 'base64':
        result = base64.b64decode(encoded)

# Path of the Thunderbird mbox file to read.
mboxfile = 'C:/Users/Username/Documents/Thunderbird/Data/profile/ImapMail/server.name/INBOX'

# Scan every message, remembering a text/plain payload in `body` and a
# Content-Transfer-Encoding header value in `cte`.
# NOTE(review): messages that are not multipart are never examined (there is
# no else branch), and `body`/`cte` stay unbound if nothing ever matches.
for msg in mailbox.mbox(mboxfile):
    if msg.is_multipart():    #Walk through the parts of the email to find the text body.
        for part in msg.walk():
            if part.is_multipart(): # If part is multipart, walk through the subparts.
                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        body = subpart.get_payload() # Get the subpart payload (i.e the message body)
                    # NOTE(review): this header loop runs for every subpart, not only the
                    # text/plain one, so `cte` may come from a different part than `body`.
                    for k,v in subpart.items():
                            if k == 'Content-Transfer-Encoding':
                                cte = v             # Keep the Content Transfer Encoding
            # NOTE(review): this branch tests `subpart`, not `part`; `subpart` is whatever
            # the inner loop last left behind (or unbound if that loop has not run yet).
            elif subpart.get_content_type() == 'text/plain':
                body = part.get_payload()           # part isn't multipart Get the payload
                for k,v in part.items():
                    if k == 'Content-Transfer-Encoding':
                        cte = v                      # Keep the Content Transfer Encoding

# These statements are outside the loop, so only the values left over after the
# last message are printed and converted.
print(body)
print('Body is of type:',type(body))
body = myconvert(body,cte)
print(body)

But this fails with:

Body is of type: <class 'str'>
Traceback (most recent call last):
File "C:/Users/David/Documents/Python/test2.py", line 31, in <module>
  body = myconvert(body,cte)
File "C:/Users/David/Documents/Python/test2.py", line 6, in myconvert
  result = quopri.decodestring(encoded)
File "C:\Python32\lib\quopri.py", line 164, in decodestring
  return a2b_qp(s, header=header)
TypeError: 'str' does not support the buffer interface

Extracting the body of an email from mbox file, decoding it to plain text regardless of Charset and Content Transfer Encoding

Answer

Related questions