Batch querying every 10 min and processing new messages

This commit is contained in:
IamTheFij 2018-02-08 09:36:09 -08:00
parent 814f57a2e4
commit 563e75f2b6
2 changed files with 61 additions and 15 deletions
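In outline, the change swaps the old one-shot scan of the last 30 days for a long-lived polling loop that remembers the newest message date and UID between passes. A condensed sketch of the loop added below (not a drop-in module; names match the diff):

    since_date = datetime.now(tzutc()) - timedelta(days=1)
    last_message = 0
    while True:
        # process_messages skips UIDs at or below last_message and
        # returns the newest date and UID it saw, so the next pass
        # only touches mail that arrived in the meantime
        since_date, last_message = self.process_messages(
            server, since_date, last_message=last_message
        )
        sleep(10 * 60)  # batch every 10 minutes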


@@ -2,10 +2,13 @@ from datetime import date
 from datetime import datetime
 from datetime import timedelta
 from getpass import getpass
+from time import sleep
 
 import email
 import json
 import os
 
+from dateutil import parser
+from dateutil.tz import tzutc
 from imbox import Imbox
 import requests
@@ -23,6 +26,7 @@ class MailCrawler(object):
         self.imap_pass = os.environ['IMAP_PASS']
 
     def get_parsers(self):
+        """Retrieves a list of parser hosts"""
         if self.parser_hosts is None:
             self.parser_hosts = []
             parser_format = 'PARSER_{}'
@@ -36,6 +40,7 @@ class MailCrawler(object):
         return self.parser_hosts
 
     def parse_message(self, message):
+        """Parses tokens from an email message"""
         text = self.get_email_text(message)
         if not text:
             print('No email text returned')
@@ -57,6 +62,7 @@ class MailCrawler(object):
         return results
 
     def get_server(self):
+        """Returns an active IMAP server"""
         return Imbox(
             self.imap_url,
             username=self.imap_user,
@@ -65,13 +71,15 @@ class MailCrawler(object):
         )
 
     def get_email_text(self, message):
+        """Retrieves the text body of an email message"""
         body = message.body.get('plain') or message.body.get('html')
         if not body:
             return None
         # Concat all known body content together since it doesn't really matter
         return ''.join([text for text in body if isinstance(text, str)])
 
-    def index_message(self, message):
+    def index_token(self, message):
+        """Sends a token from the parser to the indexer"""
         response = requests.post(
             self.indexer_host+'/token',
             json=message,
@@ -79,24 +87,61 @@ class MailCrawler(object):
         response.raise_for_status()
         return response.json()
 
-    def run(self):
-        print('Starting crawler')
+    def process_message(self, message):
+        """Process a single email message"""
+        for result in self.parse_message(message):
+            result.update({
+                'subject': message.subject,
+            })
+            print('Parsed result: ', result)
+            print('Indexed result: ', self.index_token(result))
 
-        with self.get_server() as server:
-            since_date = datetime.now() - timedelta(days=30)
-            for uid, message in server.messages(date__gt=since_date):
-                print(
-                    'Processing message uid {} message_id {} '
-                    'with subject "{}"'.format(
-                        uid, message.message_id, message.subject
-                    )
-                )
-                for result in self.parse_message(message):
-                    result.update({
-                        'subject': message.subject,
-                    })
-                    print('Parsed result: ', result)
-                    print('Indexed result: ', self.index_message(result))
+    def process_messages(self, server, since_date, last_message=0):
+        for uid, message in server.messages(date__gt=since_date):
+            uid = int(uid)
+            if uid <= last_message:
+                print('DDB Already seen message with uid {}. Skipping'.format(uid))
+                continue
+
+            print(
+                'Processing message uid {} message_id {} '
+                'with subject "{}"'.format(
+                    uid, message.message_id, message.subject
+                )
+            )
+            self.process_message(message)
+
+            # Update since_date
+            message_date = parser.parse(message.date)
+            print('DDB Processed message. Message date: {} Old date: {}'.format(
+                message_date, since_date
+            ))
+            since_date = max(since_date, message_date)
+            print('DDB Since date is now ', since_date)
+            last_message = max(uid, last_message)
+
+        return since_date, last_message
+
+    def run(self):
+        print('Starting crawler')
+        # TODO: Put server into some kind of context manager and property
+        with self.get_server() as server:
+            # TODO: parameterize startup date, maybe relative
+            since_date = datetime.now(tzutc()) - timedelta(days=1)
+            last_message = 0
+            while True:
+                print('Lets process')
+                since_date, last_message = self.process_messages(
+                    server,
+                    since_date,
+                    last_message=last_message
+                )
+                print('DDB Processed all. New since_date', since_date)
+                # TODO: parameterize sleep
+                # Sleep for 10 min
+                sleep(10 * 60)
 
 if __name__ == '__main__':
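One subtlety in the new loop: since_date switches from a naive datetime.now() to datetime.now(tzutc()) because parser.parse() returns timezone-aware datetimes for the Date header strings imbox exposes as message.date, and Python refuses to compare naive and aware values. A minimal illustration, with a hypothetical sample date string:

    from datetime import datetime, timedelta
    from dateutil import parser
    from dateutil.tz import tzutc

    # parser.parse keeps the UTC offset, so the result is timezone-aware
    message_date = parser.parse('Thu, 08 Feb 2018 09:36:09 -0800')
    since_date = datetime.now(tzutc()) - timedelta(days=1)

    # Both values carry tzinfo, so the comparison inside max() is legal;
    # mixing in a naive datetime.now() would raise TypeError instead
    since_date = max(since_date, message_date)

The second changed file adds the new dependency: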


@@ -1,2 +1,3 @@
+python-dateutil
 imbox
 requests
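Assuming this second file is the project's requirements.txt (the file name is not shown in this view), the new dependency is picked up the usual way:

    pip install -r requirements.txt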