Use imbox instead

This commit is contained in:
IamTheFij 2018-02-07 14:36:00 -08:00
parent 859f0e040e
commit 814f57a2e4
2 changed files with 34 additions and 41 deletions

View File

@ -1,10 +1,12 @@
from getpass import getpass
from datetime import date from datetime import date
import json from datetime import datetime
from datetime import timedelta
from getpass import getpass
import email import email
import json
import os import os
from imapclient import IMAPClient from imbox import Imbox
import requests import requests
@ -45,7 +47,7 @@ class MailCrawler(object):
response = requests.post( response = requests.post(
parser_host+'/parse', parser_host+'/parse',
json={ json={
'subject': message['SUBJECT'], 'subject': message.subject,
'message': text, 'message': text,
}, },
) )
@ -55,32 +57,19 @@ class MailCrawler(object):
return results return results
def get_server(self):
    """Build the IMAP client used to read the mailbox.

    Creates an ``Imbox`` client from ``self.imap_url``, ``self.imap_user``
    and ``self.imap_pass``. SSL is always requested.

    Returns:
        The ``Imbox`` instance. ``run`` enters it as a context manager.
    """
    return Imbox(
        self.imap_url,
        username=self.imap_user,
        password=self.imap_pass,
        ssl=True,
    )
def get_email_text(self, message):
    """Return the text content of a message, preferring plain text.

    ``message.body`` is read as a mapping with ``'plain'`` and ``'html'``
    keys, each holding a list of body parts (presumably the shape imbox
    produces -- confirm against the imbox version in use).

    All ``str`` parts of one content type are concatenated, since the
    split between parts does not matter to the parser. Non-``str`` parts
    (e.g. undecoded bytes) are ignored. If the plain-text parts yield no
    text at all, the HTML parts are used instead.

    Args:
        message: Object exposing a ``body`` mapping as described above.

    Returns:
        The concatenated text, or ``None`` when neither content type
        yields any text.
    """
    body = message.body or {}
    # Try content types in order of preference. Falling through on an
    # *empty result* (not just a missing key) lets HTML back up a plain
    # section that exists but carries no usable text.
    for content_type in ('plain', 'html'):
        parts = body.get(content_type) or []
        text = ''.join(part for part in parts if isinstance(part, str))
        if text:
            return text
    return None
def index_message(self, message): def index_message(self, message):
response = requests.post( response = requests.post(
@ -92,18 +81,22 @@ class MailCrawler(object):
def run(self, lookback_days=30):
    """Crawl recent mail, parse each message and index every result.

    Opens the server from ``get_server`` as a context manager, asks it
    for messages newer than ``lookback_days`` days ago, and for each
    message passes every result from ``parse_message`` to
    ``index_message`` after tagging it with the message subject.
    Progress is reported with ``print``.

    Args:
        lookback_days: How many days back from now to search. Defaults
            to 30, the window that was previously hard-coded.
    """
    print('Starting crawler')
    # The cut-off is computed once so every message in this run is
    # filtered against the same point in time.
    since_date = datetime.now() - timedelta(days=lookback_days)
    with self.get_server() as server:
        for uid, message in server.messages(date__gt=since_date):
            print(
                'Processing message uid {} message_id {} '
                'with subject "{}"'.format(
                    uid, message.message_id, message.subject
                )
            )
            for result in self.parse_message(message):
                # Attach the subject so indexed results can be traced
                # back to the originating email.
                result.update({
                    'subject': message.subject,
                })
                print('Parsed result: ', result)
                print('Indexed result: ', self.index_message(result))
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,2 +1,2 @@
imapclient imbox
requests requests