Crawler now logs results of parser to console

This commit is contained in:
IamTheFij 2018-02-02 19:19:29 -08:00
parent 5882fe6ebb
commit 9d88bcf1b2
3 changed files with 42 additions and 3 deletions

View File

@ -1,21 +1,52 @@
from getpass import getpass
from datetime import date
import json
import email
import os
from imapclient import IMAPClient
import requests
# MIME content types regarded as valid message text
# (presumably consulted by is_valid_content_type -- confirm against that method).
VALID_CONTENT_TYPES = [
    'text/plain',
    'text/html',
]
class MailCrawler(object):
parser_hosts = None
def __init__(self):
self.imap_url = os.environ['IMAP_URL']
self.imap_user = os.environ['IMAP_USER']
self.imap_pass = os.environ['IMAP_PASS']
def get_parsers(self):
if self.parser_hosts is None:
self.parser_hosts = []
parser_format = 'PARSER_{}'
parser_index = 1
parser_host = os.environ.get(parser_format.format(parser_index))
while parser_host is not None:
self.parser_hosts.append(parser_host)
parser_index += 1
parser_host = os.environ.get(parser_format.format(parser_index))
return self.parser_hosts
def parse_message(self, message):
text = self.get_email_text(message)
if not text:
return []
results = []
for parser_host in self.get_parsers():
response = requests.post(
parser_host+'/parse',
json={'message': text},
)
response.raise_for_status()
results += response.json()
return results
def get_server(self):
server = IMAPClient(self.imap_url, use_uid=True)
server.login(self.imap_user, self.imap_pass)
@ -27,7 +58,8 @@ class MailCrawler(object):
def get_email_text(self, message):
if not message.is_multipart():
if self.is_valid_content_type(message):
return message.get_payload(decode=True)
# TODO: Check encoding (maybe CHARSET)
return message.get_payload(decode=True).decode("utf-8")
else:
content_type_to_payload = {
payload.get_content_type(): self.get_email_text(payload)
@ -39,14 +71,17 @@ class MailCrawler(object):
return text
return None
def run(self):
    """Crawl the INBOX and print every parser result.

    Opens a session with ``get_server``, searches the INBOX for messages
    since 2018-01-31, passes each message to ``parse_message`` and prints
    every result with the message subject added under ``'subject'``.
    The IMAP session is logged out when the crawl ends, also on error.
    """
    server = self.get_server()
    try:
        server.select_folder('INBOX')
        # NOTE: the start date of the search is hard-coded.
        message_ids = server.search(['SINCE', date(2018, 1, 31)])
        fetched = server.fetch(message_ids, 'RFC822')
        for data in fetched.values():
            email_message = email.message_from_bytes(data[b'RFC822'])
            # Assumes each parser result is a dict -- confirm against
            # the parser services.
            for result in self.parse_message(email_message):
                result.update({
                    'subject': email_message['SUBJECT'],
                })
                print(result)
    finally:
        # Close the IMAP session instead of leaking the connection.
        server.logout()
if __name__ == '__main__':

View File

@ -1 +1,2 @@
imapclient
requests

View File

@ -2,10 +2,13 @@ version: '2'
services:
crawler:
build: ./crawler
links:
- parser_package_tracking
environment:
IMAP_URL: my.iamthefij.com
IMAP_USER: iamthefij@iamthefij.com
IMAP_PASS: "${IMAP_PASS}"
PARSER_1: http://parser_package_tracking:3000
parser_package_tracking:
build: ./parsers/package-tracking
ports: