email-assistant/crawler/crawler/main.py

104 lines
3.1 KiB
Python

from datetime import date
from datetime import datetime
from datetime import timedelta
from getpass import getpass
import email
import json
import os
from imbox import Imbox
import requests
# Content types this module names as valid. Nothing in the visible code reads
# this list -- NOTE(review): confirm whether it is still needed.
VALID_CONTENT_TYPES = [
    'text/plain',
    'text/html',
]
class MailCrawler(object):
    """Crawl an IMAP mailbox, send each message to parsers, index the results.

    Configuration comes from environment variables:

    * ``IMAP_URL``, ``IMAP_USER``, ``IMAP_PASS`` -- mailbox connection settings.
    * ``INDEXER`` -- base URL of the indexer service.
    * ``PARSER_1``, ``PARSER_2``, ... -- base URLs of the parser services,
      numbered consecutively starting at 1.
    """

    # Parser base URLs; filled in lazily by get_parsers() and then cached on
    # the instance.
    parser_hosts = None

    # Read with .get() so that merely importing this module does not require
    # INDEXER to be set; __init__ raises KeyError if it is still missing when
    # a crawler is actually constructed.
    indexer_host = os.environ.get('INDEXER')

    # Seconds to wait on a parser/indexer HTTP call. Without a timeout,
    # requests blocks indefinitely on an unresponsive service. Override on a
    # subclass or instance if a service legitimately needs longer.
    request_timeout = 30

    def __init__(self):
        """Load connection settings from the environment.

        Raises:
            KeyError: if IMAP_URL, IMAP_USER, IMAP_PASS or INDEXER is not set.
        """
        self.imap_url = os.environ['IMAP_URL']
        self.imap_user = os.environ['IMAP_USER']
        self.imap_pass = os.environ['IMAP_PASS']
        if self.indexer_host is None:
            # Not available at import time; it must be set by now.
            self.indexer_host = os.environ['INDEXER']

    def get_parsers(self):
        """Return the parser base URLs, discovering them on first use.

        Reads PARSER_1, PARSER_2, ... from the environment and stops at the
        first number that is not set, so a gap in the numbering ends the list.
        The result is cached on the instance; later changes to the
        environment are not picked up.
        """
        if self.parser_hosts is None:
            hosts = []
            index = 1
            while True:
                host = os.environ.get('PARSER_{}'.format(index))
                if host is None:
                    break
                hosts.append(host)
                index += 1
            self.parser_hosts = hosts
        return self.parser_hosts

    def parse_message(self, message):
        """Send a message's text to every parser and collect their results.

        Args:
            message: object exposing ``subject`` and ``body`` (``body`` as
                consumed by get_email_text).

        Returns:
            The JSON results of all parsers concatenated into one list, or an
            empty list when the message has no usable text.

        Raises:
            requests.HTTPError: if a parser answers with an error status.
            requests.Timeout: if a parser does not answer within
                ``request_timeout`` seconds.
        """
        text = self.get_email_text(message)
        if not text:
            print('No email text returned')
            return []

        results = []
        for parser_host in self.get_parsers():
            response = requests.post(
                parser_host + '/parse',
                json={
                    'subject': message.subject,
                    'message': text,
                },
                timeout=self.request_timeout,
            )
            response.raise_for_status()
            print('Got response', response.text)
            results += response.json()
        return results

    def get_server(self):
        """Return an Imbox client for the configured mailbox, using SSL."""
        return Imbox(
            self.imap_url,
            username=self.imap_user,
            password=self.imap_pass,
            ssl=True,
        )

    def get_email_text(self, message):
        """Return the message body as a single string.

        The 'plain' body parts are preferred; 'html' is used only when there
        are no plain parts. Parts that are not ``str`` are skipped.

        Returns:
            The concatenated text, or None when the message has neither a
            plain nor an html body.
        """
        body = message.body.get('plain') or message.body.get('html')
        if not body:
            return None
        # Concatenate all known body parts; their boundaries do not matter
        # to the parsers.
        return ''.join(part for part in body if isinstance(part, str))

    def index_message(self, message):
        """POST one parsed result to the indexer and return its JSON reply.

        Args:
            message: JSON-serialisable result, as produced by parse_message.

        Raises:
            requests.HTTPError: if the indexer answers with an error status.
            requests.Timeout: if the indexer does not answer within
                ``request_timeout`` seconds.
        """
        response = requests.post(
            self.indexer_host + '/token',
            json=message,
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        return response.json()

    def run(self):
        """Parse and index every message from the last 30 days.

        Any exception raised while parsing or indexing a message propagates
        and ends the crawl.
        """
        print('Starting crawler')
        with self.get_server() as server:
            since_date = datetime.now() - timedelta(days=30)
            for uid, message in server.messages(date__gt=since_date):
                print(
                    'Processing message uid {} message_id {} '
                    'with subject "{}"'.format(
                        uid, message.message_id, message.subject
                    )
                )
                for result in self.parse_message(message):
                    # Attach the subject so each indexed result carries it.
                    result.update({
                        'subject': message.subject,
                    })
                    print('Parsed result: ', result)
                    print('Indexed result: ', self.index_message(result))
if __name__ == '__main__':
    # Script entry point: build a crawler from the environment and run it.
    crawler = MailCrawler()
    crawler.run()