Crawler now logs results of parser to console

This commit is contained in:
IamTheFij 2018-02-02 19:19:29 -08:00
parent 5882fe6ebb
commit 9d88bcf1b2
3 changed files with 42 additions and 3 deletions

View File

@ -1,21 +1,52 @@
from getpass import getpass from getpass import getpass
from datetime import date from datetime import date
import json
import email import email
import os import os
from imapclient import IMAPClient from imapclient import IMAPClient
import requests
VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ] VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ]
class MailCrawler(object): class MailCrawler(object):
parser_hosts = None
def __init__(self): def __init__(self):
self.imap_url = os.environ['IMAP_URL'] self.imap_url = os.environ['IMAP_URL']
self.imap_user = os.environ['IMAP_USER'] self.imap_user = os.environ['IMAP_USER']
self.imap_pass = os.environ['IMAP_PASS'] self.imap_pass = os.environ['IMAP_PASS']
def get_parsers(self):
    """Return the parser hosts configured through the environment.

    Hosts are read from the variables ``PARSER_1``, ``PARSER_2``, ... in
    order, stopping at the first index that is not set. The list is built
    on the first call and stored on ``self.parser_hosts``; later calls
    return that stored list without reading the environment again.
    """
    # Already discovered: reuse the stored list.
    if self.parser_hosts is not None:
        return self.parser_hosts

    discovered = []
    index = 1
    while True:
        host = os.environ.get('PARSER_{}'.format(index))
        if host is None:
            # A gap in the numbering ends the scan.
            break
        discovered.append(host)
        index += 1

    self.parser_hosts = discovered
    return self.parser_hosts
def parse_message(self, message, timeout=30):
    """Send the text of ``message`` to every configured parser.

    Args:
        message: Message object accepted by ``self.get_email_text``.
        timeout: Seconds to wait for each parser request; forwarded to
            ``requests.post`` so an unresponsive parser cannot block the
            crawl indefinitely.

    Returns:
        A list with the JSON results of all parsers concatenated in the
        order returned by ``self.get_parsers()``. Empty when the message
        yields no text or no parsers are configured.

    Raises:
        requests.exceptions.HTTPError: A parser answered with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: A parser did not answer in time.
    """
    text = self.get_email_text(message)
    if not text:
        return []

    results = []
    for parser_host in self.get_parsers():
        response = requests.post(
            # Tolerate hosts configured with a trailing slash so the URL
            # does not end up as ``...//parse``.
            parser_host.rstrip('/') + '/parse',
            json={'message': text},
            timeout=timeout,
        )
        response.raise_for_status()
        # NOTE(review): assumes each parser replies with a JSON list of
        # result objects -- confirm against the parser services.
        results += response.json()
    return results
def get_server(self): def get_server(self):
server = IMAPClient(self.imap_url, use_uid=True) server = IMAPClient(self.imap_url, use_uid=True)
server.login(self.imap_user, self.imap_pass) server.login(self.imap_user, self.imap_pass)
@ -27,7 +58,8 @@ class MailCrawler(object):
def get_email_text(self, message): def get_email_text(self, message):
if not message.is_multipart(): if not message.is_multipart():
if self.is_valid_content_type(message): if self.is_valid_content_type(message):
return message.get_payload(decode=True) # TODO: Check encoding (maybe CHARSET)
return message.get_payload(decode=True).decode("utf-8")
else: else:
content_type_to_payload = { content_type_to_payload = {
payload.get_content_type(): self.get_email_text(payload) payload.get_content_type(): self.get_email_text(payload)
@ -39,14 +71,17 @@ class MailCrawler(object):
return text return text
return None return None
def run(self): def run(self):
server = self.get_server() server = self.get_server()
server.select_folder('INBOX') server.select_folder('INBOX')
message_ids = server.search(['SINCE', date(2018, 1, 31)]) message_ids = server.search(['SINCE', date(2018, 1, 31)])
for msgid, data in server.fetch(message_ids, 'RFC822').items(): for msgid, data in server.fetch(message_ids, 'RFC822').items():
email_message = email.message_from_bytes(data[b'RFC822']) email_message = email.message_from_bytes(data[b'RFC822'])
print(self.get_email_text(email_message)) for result in self.parse_message(email_message):
result.update({
'subject': email_message['SUBJECT'],
})
print(result)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1 +1,2 @@
imapclient imapclient
requests

View File

@ -2,10 +2,13 @@ version: '2'
services: services:
crawler: crawler:
build: ./crawler build: ./crawler
links:
- parser_package_tracking
environment: environment:
IMAP_URL: my.iamthefij.com IMAP_URL: my.iamthefij.com
IMAP_USER: iamthefij@iamthefij.com IMAP_USER: iamthefij@iamthefij.com
IMAP_PASS: "${IMAP_PASS}" IMAP_PASS: "${IMAP_PASS}"
PARSER_1: http://parser_package_tracking:3000
parser_package_tracking: parser_package_tracking:
build: ./parsers/package-tracking build: ./parsers/package-tracking
ports: ports: