Update crawler to use new version and avoid redundant fetching

This commit is contained in:
IamTheFij 2020-01-24 01:28:38 +00:00
parent 854b090145
commit 2947220b21
4 changed files with 96 additions and 40 deletions

View File

@ -1,3 +1,3 @@
FROM python:3.6-onbuild FROM python:3.6-onbuild
CMD python -m crawler.main CMD python -m crawler.main -v

View File

@ -1,13 +1,22 @@
import logging
import os
from argparse import ArgumentParser
from datetime import datetime from datetime import datetime
from datetime import timedelta from datetime import timedelta
from time import sleep
from imaplib import IMAP4 from imaplib import IMAP4
import os from time import sleep
import requests
from dateutil import parser from dateutil import parser
from dateutil.tz import tzutc from dateutil.tz import tzutc
from imbox import Imbox from imbox import Imbox
import requests
logging.basicConfig(
level=logging.WARNING,
format='%(asctime)s %(levelname)s %(name)s %(message)s'
)
logging.getLogger(__name__).addHandler(logging.NullHandler())
VALID_CONTENT_TYPES = ['text/plain', 'text/html'] VALID_CONTENT_TYPES = ['text/plain', 'text/html']
@ -19,13 +28,15 @@ def get_message_subject(message):
class MailCrawler(object): class MailCrawler(object):
parser_hosts = None
indexer_host = os.environ['INDEXER']
def __init__(self): def __init__(self):
self._logger = logging.getLogger(self.__class__.__name__)
self.imap_url = os.environ['IMAP_URL'] self.imap_url = os.environ['IMAP_URL']
self.imap_user = os.environ['IMAP_USER'] self.imap_user = os.environ['IMAP_USER']
self.imap_pass = os.environ['IMAP_PASS'] self.imap_pass = os.environ['IMAP_PASS']
self.parser_hosts = None
self.indexer_host = os.environ.get('INDEXER')
self.debug_mode = os.environ.get('DEBUG', False)
def get_parsers(self): def get_parsers(self):
"""Retrieves a list of parser hosts""" """Retrieves a list of parser hosts"""
@ -82,6 +93,8 @@ class MailCrawler(object):
def index_token(self, message): def index_token(self, message):
"""Sends a token from the parser to the indexer""" """Sends a token from the parser to the indexer"""
if self.indexer_host is None and self.debug_mode:
print("DDB No indexer host, but OK for debugging")
response = requests.post( response = requests.post(
self.indexer_host+'/token', self.indexer_host+'/token',
json=message, json=message,
@ -93,62 +106,103 @@ class MailCrawler(object):
"""Process a single email message""" """Process a single email message"""
for result in self.parse_message(message): for result in self.parse_message(message):
result.update({ result.update({
'subject': message.subject, "subject": message.subject,
}) })
print('Parsed result: ', result) print("Parsed result: ", result)
print('Indexed result: ', self.index_token(result)) print("Indexed result: ", self.index_token(result))
def process_messages(self, server, since_date, last_message=0): def process_messages(self, server, since_date, last_uid=0):
for uid, message in server.messages(date__gt=since_date): kwargs = {}
if last_uid > 0:
kwargs["uid__range"] = "{}:*".format(last_uid)
else:
kwargs["date__gt"] = since_date
self._logger.info("Mailbox search kwargs %s", kwargs)
for uid, message in server.messages(**kwargs):
uid = int(uid) uid = int(uid)
if uid <= last_message: if uid <= last_uid:
print( self._logger.debug(
'DDB Already seen message with uid {}. Skipping'.format(uid) "DDB Already seen message with uid {}. Skipping".format(uid)
) )
continue continue
print( self._logger.info(
'Processing message uid {} message_id {} ' "Processing message uid %s message_id %s with subject '%s'",
'with subject "{}"'.format(
uid, uid,
message.message_id, message.message_id,
get_message_subject(message), get_message_subject(message),
) )
)
self.process_message(message) self.process_message(message)
# Update since_date # Update since_date
message_date = parser.parse(message.date) message_date = parser.parse(message.date)
print('DDB Processed message. Message date: {} Old date: {}'.format( self._logger.debug(
"DDB Processed message. Message date: %s Old date: %s",
message_date, since_date message_date, since_date
)) )
try: try:
since_date = max(since_date, message_date) since_date = max(since_date, message_date)
except TypeError: except TypeError:
print("Error comparing dates. We'll just use the last one") self._logger.error(
print('DDB Since date is now ', since_date) "Error comparing dates. We'll just use the last one"
last_message = max(uid, last_message) )
self._logger.debug("DDB Since date is now %s", since_date)
last_uid = max(uid, last_uid)
return since_date, last_message return since_date, last_uid
def run(self): def _parse_args(self, args=None):
print('Starting crawler') """Parses command line arguments and returns them"""
# TODO: Put server into some kind of context manager and property parser = ArgumentParser(description="Inbox crawler")
parser.add_argument(
"--sleep", "-s",
default=10*60,
help=("Number of seconds to wait between polling IMAP server."
"Default 10 min"),
)
parser.add_argument(
"--verbosity", "-v",
action="count",
help=("Adjust log verbosity by increasing arg count. Default log",
"level is ERROR. Level increases with each `v`"),
)
return parser.parse_args(args)
def _set_log_level(self, verbosity):
"""Sets the log level for the class using the provided verbosity count"""
if verbosity == 1:
# Set this logger to info
self._logger.setLevel(logging.INFO)
elif verbosity == 2:
# Set this logger to debug and root to info
logging.getLogger().setLevel(logging.INFO)
self._logger.setLevel(logging.DEBUG)
elif verbosity >= 3:
# Set the root logger to debug
logging.getLogger().setLevel(logging.DEBUG)
def run(self, args=None):
args = self._parse_args(args=args)
if args.verbosity:
self._set_log_level(args.verbosity)
self._logger.info('Starting crawler')
with self.get_server() as server: with self.get_server() as server:
# TODO: parameterize startup date, maybe relative # TODO: parameterize startup date, maybe relative
since_date = datetime.now(tzutc()) - timedelta(days=15) since_date = datetime.now(tzutc()) - timedelta(days=16)
last_message = 0 last_uid = 0
while True: while True:
print('Lets process') print("Processing messages")
since_date, last_message = self.process_messages( since_date, last_uid = self.process_messages(
server, server,
since_date, since_date,
last_message=last_message last_uid=last_uid
) )
print('DDB Processed all. New since_date', since_date) self._logger.info(
# TODO: parameterize sleep "DDB Processed all. New since_date %s",
# Sleep for 10 min since_date,
sleep(10 * 60) )
sleep(args.sleep)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -8,3 +8,5 @@ services:
IMAP_URL: my.iamthefij.com IMAP_URL: my.iamthefij.com
IMAP_USER: iamthefij@iamthefij.com IMAP_USER: iamthefij@iamthefij.com
IMAP_PASS: "${IMAP_PASS}" IMAP_PASS: "${IMAP_PASS}"
INDEXER: http://indexer:5000
PARSER_1: http://parser_package_tracking:3000

View File

@ -1,3 +1,3 @@
python-dateutil python-dateutil
imbox
requests requests
git+https://github.com/martinrusev/imbox@fd68b35e22686f43cdb7e3df344efc9b3a26b1e6