2020-01-24 01:28:38 +00:00
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
from argparse import ArgumentParser
|
2018-02-07 22:36:00 +00:00
|
|
|
from datetime import datetime
|
|
|
|
from datetime import timedelta
|
2018-02-08 23:00:17 +00:00
|
|
|
from imaplib import IMAP4
|
2020-01-24 01:28:38 +00:00
|
|
|
from time import sleep
|
2018-02-03 01:40:20 +00:00
|
|
|
|
2020-01-24 01:28:38 +00:00
|
|
|
import requests
|
2018-02-08 17:36:09 +00:00
|
|
|
from dateutil import parser
|
|
|
|
from dateutil.tz import tzutc
|
2018-02-07 22:36:00 +00:00
|
|
|
from imbox import Imbox
|
2020-01-24 01:28:38 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Configure root logging once at import time; loggers created below and in
# MailCrawler inherit this WARNING threshold unless raised via -v at runtime.
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s %(levelname)s %(name)s %(message)s'
)
# Attach a no-op handler to this module's logger so importing it as a
# library never triggers "no handlers could be found" warnings.
logging.getLogger(__name__).addHandler(logging.NullHandler())

# MIME content types we know how to extract text from.
# NOTE(review): not referenced anywhere in this file -- possibly consumed
# elsewhere or dead; confirm before removing.
VALID_CONTENT_TYPES = ['text/plain', 'text/html']
|
|
|
|
|
|
|
def get_message_subject(message):
    """Return the message's subject, or a placeholder when it has none.

    Some messages arrive without a Subject header, in which case the imbox
    message object has no ``subject`` attribute at all.
    """
    try:
        return message.subject
    except AttributeError:
        return 'NO SUBJECT'
|
|
|
2018-02-03 01:40:20 +00:00
|
|
|
class MailCrawler(object):
    """Polls an IMAP mailbox, runs new messages through parser services,
    and submits the extracted tokens to an indexer service.

    Configuration comes entirely from environment variables:
        IMAP_URL / IMAP_USER / IMAP_PASS  required IMAP credentials
        PARSER_1 .. PARSER_N              base URLs of parser services
        INDEXER                           base URL of the indexer service
        DEBUG                             enables debug-only behavior
    """

    def __init__(self):
        self._logger = logging.getLogger(self.__class__.__name__)
        # Required settings: fail fast with KeyError if any is missing
        self.imap_url = os.environ['IMAP_URL']
        self.imap_user = os.environ['IMAP_USER']
        self.imap_pass = os.environ['IMAP_PASS']
        # Parser hosts are discovered lazily on first get_parsers() call
        self.parser_hosts = None
        self.indexer_host = os.environ.get('INDEXER')
        # NOTE(review): any non-empty string (even "false") is truthy here
        self.debug_mode = os.environ.get('DEBUG', False)

    def get_parsers(self):
        """Retrieves a list of parser hosts.

        Reads PARSER_1, PARSER_2, ... from the environment until the first
        gap and caches the result for subsequent calls.
        """
        if self.parser_hosts is None:
            self.parser_hosts = []
            parser_index = 1
            while True:
                parser_host = os.environ.get('PARSER_{}'.format(parser_index))
                if parser_host is None:
                    break
                self.parser_hosts.append(parser_host)
                parser_index += 1
        return self.parser_hosts

    def parse_message(self, message):
        """Parses tokens from an email message.

        POSTs the message text to every configured parser service and
        returns the combined list of results. Raises for HTTP errors.
        """
        text = self.get_email_text(message)
        if not text:
            print('No email text returned')
            return []

        results = []
        for parser_host in self.get_parsers():
            response = requests.post(
                parser_host+'/parse',
                json={
                    'subject': get_message_subject(message),
                    'message': text,
                },
            )
            response.raise_for_status()
            print('Got response', response.text)
            # Each parser returns a JSON list of token dicts
            results += response.json()
        return results

    def get_server(self):
        """Returns an active IMAP server (usable as a context manager)."""
        return Imbox(
            self.imap_url,
            username=self.imap_user,
            password=self.imap_pass,
            ssl=True,
        )

    def get_email_text(self, message):
        """Retrieves the text body of an email message.

        Prefers the plain-text part, falls back to HTML, and returns None
        when neither is present.
        """
        body = message.body.get('plain') or message.body.get('html')
        if not body:
            return None
        # Concat all known body content together since it doesn't really matter
        return ''.join([text for text in body if isinstance(text, str)])

    def index_token(self, message):
        """Sends a token from the parser to the indexer.

        Returns the indexer's JSON response, or None when no indexer host
        is configured. BUG FIX: previously a missing indexer host fell
        through to ``None + '/token'`` and crashed with TypeError.
        """
        if self.indexer_host is None:
            if self.debug_mode:
                print("DDB No indexer host, but OK for debugging")
            return None
        response = requests.post(
            self.indexer_host+'/token',
            json=message,
        )
        response.raise_for_status()
        return response.json()

    def process_message(self, message):
        """Process a single email message: parse it and index every result."""
        for result in self.parse_message(message):
            result.update({
                # Use the safe accessor: message.subject may not exist
                "subject": get_message_subject(message),
            })
            print("Parsed result: ", result)
            print("Indexed result: ", self.index_token(result))

    def process_messages(self, server, since_date, last_uid=0):
        """Processes all new messages and returns updated high-water marks.

        On the first pass (last_uid == 0) searches by date; afterwards
        searches by UID range, which avoids date-granularity duplicates.
        Returns a (since_date, last_uid) tuple for the next poll.
        """
        kwargs = {}
        if last_uid > 0:
            kwargs["uid__range"] = "{}:*".format(last_uid)
        else:
            kwargs["date__gt"] = since_date
        self._logger.info("Mailbox search kwargs %s", kwargs)
        for uid, message in server.messages(**kwargs):
            uid = int(uid)
            if uid <= last_uid:
                # uid__range is inclusive, so the previous high-water mark
                # comes back every time; skip anything already handled
                self._logger.debug(
                    "DDB Already seen message with uid {}. Skipping".format(uid)
                )
                continue

            self._logger.info(
                "Processing message uid %s message_id %s with subject '%s'",
                uid,
                message.message_id,
                get_message_subject(message),
            )
            self.process_message(message)

            # Update since_date
            message_date = parser.parse(message.date)
            self._logger.debug(
                "DDB Processed message. Message date: %s Old date: %s",
                message_date, since_date
            )
            try:
                since_date = max(since_date, message_date)
            except TypeError:
                # Naive vs tz-aware datetimes can't be compared; keep the
                # previous since_date rather than crash
                self._logger.error(
                    "Error comparing dates. We'll just use the last one"
                )
            self._logger.debug("DDB Since date is now %s", since_date)
            last_uid = max(uid, last_uid)

        return since_date, last_uid

    def _parse_args(self, args=None):
        """Parses command line arguments and returns them"""
        # Local name avoids shadowing the module-level dateutil `parser`
        arg_parser = ArgumentParser(description="Inbox crawler")
        arg_parser.add_argument(
            "--sleep", "-s",
            # BUG FIX: without type=int a CLI-supplied value reached
            # time.sleep() as a str and raised TypeError
            type=int,
            default=10*60,
            help=("Number of seconds to wait between polling IMAP server. "
                  "Default 10 min"),
        )
        arg_parser.add_argument(
            "--verbosity", "-v",
            action="count",
            # BUG FIX: help was a tuple (comma instead of implicit string
            # concatenation), which breaks --help rendering
            help=("Adjust log verbosity by increasing arg count. Default log "
                  "level is ERROR. Level increases with each `v`"),
        )
        return arg_parser.parse_args(args)

    def _set_log_level(self, verbosity):
        """Sets the log level for the class using the provided verbosity count

        1 -> this logger INFO; 2 -> this logger DEBUG, root INFO;
        3+ -> root DEBUG. Zero/None leaves levels untouched.
        """
        if verbosity == 1:
            # Set this logger to info
            self._logger.setLevel(logging.INFO)
        elif verbosity == 2:
            # Set this logger to debug and root to info
            logging.getLogger().setLevel(logging.INFO)
            self._logger.setLevel(logging.DEBUG)
        elif verbosity >= 3:
            # Set the root logger to debug
            logging.getLogger().setLevel(logging.DEBUG)

    def run(self, args=None):
        """Main entry point: polls the mailbox forever, sleeping between passes."""
        args = self._parse_args(args=args)
        if args.verbosity:
            self._set_log_level(args.verbosity)

        self._logger.info('Starting crawler')
        with self.get_server() as server:
            # TODO: parameterize startup date, maybe relative
            since_date = datetime.now(tzutc()) - timedelta(days=16)
            last_uid = 0
            while True:
                print("Processing messages")
                since_date, last_uid = self.process_messages(
                    server,
                    since_date,
                    last_uid=last_uid
                )
                self._logger.info(
                    "DDB Processed all. New since_date %s",
                    since_date,
                )
                sleep(args.sleep)
2018-02-03 01:40:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Run forever. IMAP4.abort means the server unilaterally dropped the
    # connection, so loop around and reconnect with a fresh crawler; any
    # other exception (including KeyboardInterrupt) is intentionally fatal.
    while True:
        try:
            MailCrawler().run()
        except IMAP4.abort:
            print('Imap abort. We will try to reconnect')
            pass
|