# email-assistant/crawler/crawler/main.py
import logging
import os
from argparse import ArgumentParser
from datetime import datetime
from datetime import timedelta
from imaplib import IMAP4
from time import sleep

import requests
from dateutil import parser
from dateutil.tz import tzutc
from imbox import Imbox
# Root logging configuration for the crawler process; MailCrawler raises
# individual logger levels at runtime based on the -v flag.
logging.basicConfig(
    level=logging.WARNING, format="%(asctime)s %(levelname)s %(name)s %(message)s"
)
# NullHandler avoids "no handlers could be found" warnings if this module
# is imported as a library without configuring logging first.
logging.getLogger(__name__).addHandler(logging.NullHandler())

# MIME content types the crawler can extract text from.
# NOTE(review): not referenced in this file — presumably used by parsers
# or kept for future filtering; confirm before removing.
VALID_CONTENT_TYPES = ["text/plain", "text/html"]
def get_message_subject(message):
    """Return the message's subject, or a placeholder when unavailable.

    BUG FIX: the previous getattr-only form returned None when the
    ``subject`` attribute existed but was None (or ""), despite promising
    a placeholder. Treat a missing, None, or empty subject uniformly.
    """
    subject = getattr(message, "subject", None)
    return subject if subject else "NO SUBJECT"
class MailCrawler(object):
    """Polls an IMAP inbox, sends message text to parser services, and
    forwards parsed tokens to an indexer service.

    Configuration comes from environment variables:
        IMAP_URL / IMAP_USER / IMAP_PASS: IMAP connection credentials.
        PARSER_1..PARSER_N: base URLs of parser services (contiguous indexes).
        INDEXER: base URL of the indexer service (optional in debug mode).
        DEBUG: enables debug mode unless empty/"0"/"false"/"no".
    """

    def __init__(self):
        self._logger = logging.getLogger(self.__class__.__name__)
        self.imap_url = os.environ["IMAP_URL"]
        self.imap_user = os.environ["IMAP_USER"]
        self.imap_pass = os.environ["IMAP_PASS"]
        # Lazily populated by get_parsers()
        self.parser_hosts = None
        self.indexer_host = os.environ.get("INDEXER")
        # BUG FIX: os.environ.get("DEBUG", False) stored the raw string, so
        # DEBUG=false was truthy. Interpret common falsy spellings instead.
        self.debug_mode = os.environ.get("DEBUG", "").lower() not in (
            "",
            "0",
            "false",
            "no",
        )

    def get_parsers(self):
        """Returns the list of parser hosts from PARSER_1..PARSER_N env vars.

        The scan stops at the first missing index, so indexes must be
        contiguous starting at 1. The result is cached on the instance.
        """
        if self.parser_hosts is None:
            self.parser_hosts = []
            parser_index = 1
            while True:
                parser_host = os.environ.get("PARSER_{}".format(parser_index))
                if parser_host is None:
                    break
                self.parser_hosts.append(parser_host)
                parser_index += 1
        return self.parser_hosts

    def parse_message(self, message):
        """Parses tokens from an email message.

        POSTs the subject and body text to every configured parser and
        concatenates their JSON results. Raises requests.HTTPError on a
        failed parser response.
        """
        text = self.get_email_text(message)
        if not text:
            print("No email text returned")
            return []

        results = []
        for parser_host in self.get_parsers():
            response = requests.post(
                parser_host + "/parse",
                json={
                    "subject": get_message_subject(message),
                    "message": text,
                },
            )
            response.raise_for_status()
            print("Got response", response.text)
            results += response.json()
        return results

    def get_server(self):
        """Returns an active IMAP server connection (usable as a context manager)"""
        return Imbox(
            self.imap_url,
            username=self.imap_user,
            password=self.imap_pass,
            ssl=True,
        )

    def get_email_text(self, message):
        """Retrieves the text body of an email message.

        Prefers the plain-text parts, falls back to HTML, and returns None
        when neither is present. Non-str parts (e.g. bytes) are skipped.
        """
        body = message.body.get("plain") or message.body.get("html")
        if not body:
            return None
        # Concat all known body content together since it doesn't really matter
        return "".join([text for text in body if isinstance(text, str)])

    def index_token(self, message):
        """Sends a token from the parser to the indexer.

        Returns the indexer's JSON response, or None when indexing is
        skipped because no indexer is configured in debug mode.

        Raises:
            RuntimeError: no indexer is configured and debug mode is off.
        """
        if self.indexer_host is None:
            # BUG FIX: previously only printed a debug note and then fell
            # through to None + "/token" (TypeError). Skip in debug mode,
            # fail loudly otherwise.
            if self.debug_mode:
                print("DDB No indexer host, but OK for debugging")
                return None
            raise RuntimeError("No indexer host configured (set INDEXER)")
        response = requests.post(
            self.indexer_host + "/token",
            json=message,
        )
        response.raise_for_status()
        return response.json()

    def process_message(self, message):
        """Process a single email message: parse it and index every token"""
        for result in self.parse_message(message):
            result.update(
                {
                    "subject": message.subject,
                }
            )
            print("Parsed result: ", result)
            print("Indexed result: ", self.index_token(result))

    def process_messages(self, server, since_date, last_uid=0):
        """Processes mailbox messages newer than since_date / last_uid.

        Returns the updated (since_date, last_uid) high-water marks for the
        next polling cycle.
        """
        kwargs = {}
        if last_uid > 0:
            # UID ranges are inclusive, so the last-seen message comes back
            # and is filtered by the uid check below.
            kwargs["uid__range"] = "{}:*".format(last_uid)
        else:
            kwargs["date__gt"] = since_date
        self._logger.info("Mailbox search kwargs %s", kwargs)
        for uid, message in server.messages(**kwargs):
            uid = int(uid)
            if uid <= last_uid:
                self._logger.debug(
                    "DDB Already seen message with uid {}. Skipping".format(uid)
                )
                continue
            self._logger.info(
                "Processing message uid %s message_id %s with subject '%s'",
                uid,
                message.message_id,
                get_message_subject(message),
            )
            self.process_message(message)

            # Track the newest message date so the next poll can resume there
            message_date = parser.parse(message.date)
            self._logger.debug(
                "DDB Processed message. Message date: %s Old date: %s",
                message_date,
                since_date,
            )
            try:
                since_date = max(since_date, message_date)
            except TypeError:
                # Comparing naive vs aware datetimes; keep the previous value
                self._logger.error("Error comparing dates. We'll just use the last one")
            self._logger.debug("DDB Since date is now %s", since_date)
            last_uid = max(uid, last_uid)
        return since_date, last_uid

    def _parse_args(self, args=None):
        """Parses command line arguments and returns them"""
        # Named arg_parser to avoid shadowing the dateutil `parser` import
        arg_parser = ArgumentParser(description="Inbox crawler")
        arg_parser.add_argument(
            "--sleep",
            "-s",
            # BUG FIX: without type=int a CLI-supplied value stayed a str
            # and broke time.sleep() later.
            type=int,
            default=10 * 60,
            help=(
                "Number of seconds to wait between polling IMAP server. "
                "Default 10 min"
            ),
        )
        arg_parser.add_argument(
            "--verbosity",
            "-v",
            action="count",
            # BUG FIX: trailing commas made this help text a tuple, which
            # crashed argparse's --help formatting.
            help=(
                "Adjust log verbosity by increasing arg count. Default log "
                "level is ERROR. Level increases with each `v`"
            ),
        )
        return arg_parser.parse_args(args)

    def _set_log_level(self, verbosity):
        """Sets the log level for the class using the provided verbosity count"""
        if verbosity == 1:
            # Set this logger to info
            self._logger.setLevel(logging.INFO)
        elif verbosity == 2:
            # Set this logger to debug and root to info
            logging.getLogger().setLevel(logging.INFO)
            self._logger.setLevel(logging.DEBUG)
        elif verbosity >= 3:
            # Set the root logger to debug
            logging.getLogger().setLevel(logging.DEBUG)

    def run(self, args=None):
        """Main loop: polls the IMAP server forever, processing new mail"""
        args = self._parse_args(args=args)
        if args.verbosity:
            self._set_log_level(args.verbosity)
        self._logger.info("Starting crawler")
        with self.get_server() as server:
            # TODO: parameterize startup date, maybe relative
            since_date = datetime.now(tzutc()) - timedelta(days=16)
            last_uid = 0
            while True:
                print("Processing messages")
                since_date, last_uid = self.process_messages(
                    server, since_date, last_uid=last_uid
                )
                self._logger.info(
                    "DDB Processed all. New since_date %s",
                    since_date,
                )
                sleep(args.sleep)
if __name__ == "__main__":
    # IMAP connections drop periodically; IMAP4.abort just means
    # "reconnect and resume", so restart the crawler in a loop.
    while True:
        try:
            MailCrawler().run()
        except IMAP4.abort:
            print("Imap abort. We will try to reconnect")
            # BUG FIX: without a pause, a persistently failing server put
            # us in a hot reconnect loop. Back off briefly before retrying.
            sleep(10)