From 9d88bcf1b29f8d7bcc895e4eef1db4e743bf6b57 Mon Sep 17 00:00:00 2001 From: Ian Fijolek Date: Fri, 2 Feb 2018 19:19:29 -0800 Subject: [PATCH] Crawler now logs results of parser to console --- crawler/crawler/main.py | 41 +++++++++++++++++++++++++++++++++++++--- crawler/requirements.txt | 1 + docker-compose.yml | 3 +++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/crawler/crawler/main.py b/crawler/crawler/main.py index db01597..afad10f 100644 --- a/crawler/crawler/main.py +++ b/crawler/crawler/main.py @@ -1,21 +1,52 @@ from getpass import getpass from datetime import date +import json import email import os from imapclient import IMAPClient +import requests VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ] class MailCrawler(object): + parser_hosts = None def __init__(self): self.imap_url = os.environ['IMAP_URL'] self.imap_user = os.environ['IMAP_USER'] self.imap_pass = os.environ['IMAP_PASS'] + def get_parsers(self): + if self.parser_hosts is None: + self.parser_hosts = [] + parser_format = 'PARSER_{}' + parser_index = 1 + parser_host = os.environ.get(parser_format.format(parser_index)) + while parser_host is not None: + self.parser_hosts.append(parser_host) + parser_index += 1 + parser_host = os.environ.get(parser_format.format(parser_index)) + + return self.parser_hosts + + def parse_message(self, message): + text = self.get_email_text(message) + if not text: + return [] + + results = [] + for parser_host in self.get_parsers(): + response = requests.post( + parser_host+'/parse', + json={'message': text}, + ) + response.raise_for_status() + results += response.json() + return results + def get_server(self): server = IMAPClient(self.imap_url, use_uid=True) server.login(self.imap_user, self.imap_pass) @@ -27,7 +58,8 @@ class MailCrawler(object): def get_email_text(self, message): if not message.is_multipart(): if self.is_valid_content_type(message): - return message.get_payload(decode=True) + # TODO: Check encoding (maybe CHARSET) + return message.get_payload(decode=True).decode("utf-8") else: content_type_to_payload = { payload.get_content_type(): self.get_email_text(payload) @@ -39,14 +71,17 @@ class MailCrawler(object): return text return None - def run(self): server = self.get_server() server.select_folder('INBOX') message_ids = server.search(['SINCE', date(2018, 1, 31)]) for msgid, data in server.fetch(message_ids, 'RFC822').items(): email_message = email.message_from_bytes(data[b'RFC822']) - print(self.get_email_text(email_message)) + for result in self.parse_message(email_message): + result.update({ + 'subject': email_message['SUBJECT'], + }) + print(result) if __name__ == '__main__': diff --git a/crawler/requirements.txt b/crawler/requirements.txt index 4de4db6..41bd178 100644 --- a/crawler/requirements.txt +++ b/crawler/requirements.txt @@ -1 +1,2 @@ imapclient +requests diff --git a/docker-compose.yml b/docker-compose.yml index 5df8e42..c3597bd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,10 +2,13 @@ version: '2' services: crawler: build: ./crawler + links: + - parser_package_tracking environment: IMAP_URL: my.iamthefij.com IMAP_USER: iamthefij@iamthefij.com IMAP_PASS: "${IMAP_PASS}" + PARSER_1: http://parser_package_tracking:3000 parser_package_tracking: build: ./parsers/package-tracking ports: