"""Enron email sanitizer.

Pulls a random sample of 15 rows from the ``message_candidates`` table,
cleans up each row's ``body`` text and prints the result to stdout.
Nothing is written back to the database yet (see the TODO in the loop).
"""

import quopri
import re

import oursql

print("=== Enron email sanitizer v0.1")
print("===============================")
print("")

# build up some regular expressions for processing this text
# (raw strings so the backslashes reach the regex engine untouched)
striptags_re = re.compile(r"<[^>]+>")
stripbrackets_re = re.compile(r"\[[^]]+\]")
# Longer extensions come first: alternation picks the first alternative that
# matches, so "doc" ahead of "docx" would strip "report.doc" and leave a
# stray "x" behind (same for "xls"/"xlsx").
stripcommonfiles_re = re.compile(
    r"[a-zA-Z0-9_-]+\.(png|jpeg|jpg|pst|docx|doc|xlsx|xls|avi|wav|mp3)"
)
# Currently unused: the punctuation-stripping step in _sanitize_body is
# disabled.
strippunct_re = re.compile(r"[^a-zA-Z]+")

# some common multiwords that should be one word
# (not applied anywhere yet)
multiwords = {
    "we d": "we'd",
    "he d": "he'd",
}


def _sanitize_body(body):
    """Return a cleaned-up, lower-cased version of one email body.

    Steps, in order:
      1. keep only the text before the first "-----" run
         (presumably drops forwarded/original-message trailers -- confirm),
      2. lower-case it,
      3. decode quoted-printable escapes and drop any leftover "=" signs,
      4. remove <...> tags, [...] bracketed spans and common file names.
    """
    text = body.partition("-----")[0].lower()
    text = quopri.decodestring(text).replace("=", "")
    text = striptags_re.sub('', text)
    text = stripbrackets_re.sub('', text)
    text = stripcommonfiles_re.sub('', text)
    # Disabled step, kept for reference:
    # text = strippunct_re.sub(' ', text)
    return text


# sanitizes emails from the enron corpus in-place
conn = oursql.connect(host='127.0.0.1', user='xxx', db='xxx')

try:
    with conn.cursor(oursql.DictCursor) as cursor:
        cursor.execute("""select * from message_candidates order by rand() limit 15""")

        for row in cursor:
            newbody = _sanitize_body(row['body'])

            print(newbody)
            print("")
            print("")

            # TODO: insert sanitized version into new column in same table
            print("")
finally:
    # Release the connection even if a query or a row's cleanup fails.
    conn.close()