Batch querying every 10 min and processing new messages

This commit is contained in:
IamTheFij 2018-02-08 09:36:09 -08:00
parent 814f57a2e4
commit 563e75f2b6
2 changed files with 61 additions and 15 deletions

View File

@@ -2,10 +2,13 @@ from datetime import date
from datetime import datetime from datetime import datetime
from datetime import timedelta from datetime import timedelta
from getpass import getpass from getpass import getpass
from time import sleep
import email import email
import json import json
import os import os
from dateutil import parser
from dateutil.tz import tzutc
from imbox import Imbox from imbox import Imbox
import requests import requests
@@ -23,6 +26,7 @@ class MailCrawler(object):
self.imap_pass = os.environ['IMAP_PASS'] self.imap_pass = os.environ['IMAP_PASS']
def get_parsers(self): def get_parsers(self):
"""Retrieves a list of parser hosts"""
if self.parser_hosts is None: if self.parser_hosts is None:
self.parser_hosts = [] self.parser_hosts = []
parser_format = 'PARSER_{}' parser_format = 'PARSER_{}'
@@ -36,6 +40,7 @@ class MailCrawler(object):
return self.parser_hosts return self.parser_hosts
def parse_message(self, message): def parse_message(self, message):
"""Parses tokens from an email message"""
text = self.get_email_text(message) text = self.get_email_text(message)
if not text: if not text:
print('No email text returned') print('No email text returned')
@@ -57,6 +62,7 @@ class MailCrawler(object):
return results return results
def get_server(self): def get_server(self):
"""Returns an active IMAP server"""
return Imbox( return Imbox(
self.imap_url, self.imap_url,
username=self.imap_user, username=self.imap_user,
@@ -65,13 +71,15 @@ class MailCrawler(object):
) )
def get_email_text(self, message): def get_email_text(self, message):
"""Retrieves the text body of an email message"""
body = message.body.get('plain') or message.body.get('html') body = message.body.get('plain') or message.body.get('html')
if not body: if not body:
return None return None
# Concat all known body content together since it doesn't really matter # Concat all known body content together since it doesn't really matter
return ''.join([text for text in body if isinstance(text, str)]) return ''.join([text for text in body if isinstance(text, str)])
def index_message(self, message): def index_token(self, message):
"""Sends a token from the parser to the indexer"""
response = requests.post( response = requests.post(
self.indexer_host+'/token', self.indexer_host+'/token',
json=message, json=message,
@@ -79,24 +87,61 @@ class MailCrawler(object):
response.raise_for_status() response.raise_for_status()
return response.json() return response.json()
def process_message(self, message):
    """Run a single email through the parsers and index every token.

    Each token dict produced by ``parse_message`` is tagged with the
    message's subject line before being submitted to the indexer via
    ``index_token``.
    """
    for token in self.parse_message(message):
        # Attach the subject so the indexer can associate the token
        # with the email it came from.
        token['subject'] = message.subject
        print('Parsed result: ', token)
        print('Indexed result: ', self.index_token(token))
def process_messages(self, server, since_date, last_message=0):
    """Process every message received after *since_date* exactly once.

    Messages whose integer uid is not greater than *last_message* are
    assumed to have been handled on an earlier pass and are skipped.
    Returns an updated ``(since_date, last_message)`` watermark pair for
    the next polling cycle.

    NOTE(review): relies on the module-level dateutil ``parser`` import
    to interpret ``message.date``.
    """
    for raw_uid, message in server.messages(date__gt=since_date):
        uid = int(raw_uid)
        if uid > last_message:
            print(
                'Processing message uid {} message_id {} with subject "{}"'.format(
                    uid, message.message_id, message.subject
                )
            )
            self.process_message(message)
            # Advance the date watermark using the message's own date
            # header, never moving it backwards.
            message_date = parser.parse(message.date)
            print('DDB Processed message. Message date: {} Old date: {}'.format(
                message_date, since_date
            ))
            since_date = max(since_date, message_date)
            print('DDB Since date is now ', since_date)
            # uid is strictly greater than last_message here, so it
            # becomes the new high-water mark.
            last_message = uid
        else:
            print('DDB Already seen message with uid {}. Skipping'.format(uid))
    return since_date, last_message
def run(self):
    """Main crawler loop: poll the IMAP server every 10 minutes.

    Opens one IMAP connection, then repeatedly processes any messages
    newer than the running watermarks and sleeps between passes.

    NOTE(review): reconstructed from the right-hand (post-commit)
    column of the rendered diff; relies on module-level ``datetime``,
    ``timedelta``, ``tzutc`` and ``sleep`` imports.
    """
    print('Starting crawler')
    # TODO: Put server into some kind of context manager and property
    with self.get_server() as server:
        # TODO: parameterize startup date, maybe relative
        since_date = datetime.now(tzutc()) - timedelta(days=1)
        last_message = 0
        while True:
            print('Lets process')
            since_date, last_message = self.process_messages(
                server,
                since_date,
                last_message=last_message
            )
            print('DDB Processed all. New since_date', since_date)
            # TODO: parameterize sleep
            # Sleep for 10 min
            sleep(10 * 60)
if __name__ == '__main__': if __name__ == '__main__':

View File

@@ -1,2 +1,3 @@
python-dateutil
imbox imbox
requests requests