Use imbox instead of imapclient

This commit is contained in:
IamTheFij 2018-02-07 14:36:00 -08:00
parent 859f0e040e
commit 814f57a2e4
2 changed files with 34 additions and 41 deletions

View File

@ -1,10 +1,12 @@
from getpass import getpass
from datetime import date from datetime import date
import json from datetime import datetime
from datetime import timedelta
from getpass import getpass
import email import email
import json
import os import os
from imapclient import IMAPClient from imbox import Imbox
import requests import requests
@ -45,7 +47,7 @@ class MailCrawler(object):
response = requests.post( response = requests.post(
parser_host+'/parse', parser_host+'/parse',
json={ json={
'subject': message['SUBJECT'], 'subject': message.subject,
'message': text, 'message': text,
}, },
) )
@ -55,32 +57,19 @@ class MailCrawler(object):
return results return results
def get_server(self): def get_server(self):
server = IMAPClient(self.imap_url, use_uid=True) return Imbox(
server.login(self.imap_user, self.imap_pass) self.imap_url,
return server username=self.imap_user,
password=self.imap_pass,
def is_valid_content_type(self, message): ssl=True,
return message.get_content_type() in VALID_CONTENT_TYPES )
def get_email_text(self, message): def get_email_text(self, message):
if not message.is_multipart(): body = message.body.get('plain') or message.body.get('html')
if self.is_valid_content_type(message): if not body:
# TODO: Check encoding (maybe CHARSET)
try:
return message.get_payload(decode=True).decode('utf-8')
except UnicodeDecodeError:
print('Error decoding')
return None
else:
content_type_to_payload = {
payload.get_content_type(): self.get_email_text(payload)
for payload in message.get_payload()
}
for content_type in VALID_CONTENT_TYPES:
text = content_type_to_payload.get(content_type)
if text:
return text
return None return None
# Concat all known body content together since it doesn't really matter
return ''.join([text for text in body if isinstance(text, str)])
def index_message(self, message): def index_message(self, message):
response = requests.post( response = requests.post(
@ -92,15 +81,19 @@ class MailCrawler(object):
def run(self): def run(self):
print('Starting crawler') print('Starting crawler')
server = self.get_server()
server.select_folder('INBOX') with self.get_server() as server:
message_ids = server.search(['SINCE', date(2018, 1, 31)]) since_date = datetime.now() - timedelta(days=30)
for msgid, data in server.fetch(message_ids, 'RFC822').items(): for uid, message in server.messages(date__gt=since_date):
print('Fetched message with id ', msgid) print(
email_message = email.message_from_bytes(data[b'RFC822']) 'Processing message uid {} message_id {} '
for result in self.parse_message(email_message): 'with subject "{}"'.format(
uid, message.message_id, message.subject
)
)
for result in self.parse_message(message):
result.update({ result.update({
'subject': email_message['SUBJECT'], 'subject': message.subject,
}) })
print('Parsed result: ', result) print('Parsed result: ', result)
print('Indexed result: ', self.index_message(result)) print('Indexed result: ', self.index_message(result))

View File

@ -1,2 +1,2 @@
imapclient imbox
requests requests