Crawler now logs results of parser to console
This commit is contained in:
parent
5882fe6ebb
commit
9d88bcf1b2
@ -1,21 +1,52 @@
|
|||||||
from getpass import getpass
|
from getpass import getpass
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
import json
|
||||||
import email
|
import email
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from imapclient import IMAPClient
|
from imapclient import IMAPClient
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ]
|
VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ]
|
||||||
|
|
||||||
|
|
||||||
class MailCrawler(object):
|
class MailCrawler(object):
|
||||||
|
parser_hosts = None
|
||||||
|
|
||||||
def __init__(self):
    """Load the IMAP connection settings from the process environment.

    Reads ``IMAP_URL``, ``IMAP_USER`` and ``IMAP_PASS`` and stores them on
    the instance as ``imap_url``, ``imap_user`` and ``imap_pass``.

    Raises:
        KeyError: if any of the three variables is not set.
    """
    settings = (
        ('imap_url', 'IMAP_URL'),
        ('imap_user', 'IMAP_USER'),
        ('imap_pass', 'IMAP_PASS'),
    )
    # Same lookup order as before, so the first missing variable is the
    # one reported by the KeyError.
    for attribute, variable in settings:
        setattr(self, attribute, os.environ[variable])
|
||||||
|
|
||||||
|
def get_parsers(self):
    """Return the list of parser hosts, discovering them on first call.

    Hosts are read from the environment variables ``PARSER_1``,
    ``PARSER_2``, ... in order, stopping at the first index that is not
    set. The list is stored on ``self.parser_hosts`` and returned as-is
    on every later call, so the environment is only scanned once.
    """
    if self.parser_hosts is not None:
        return self.parser_hosts

    self.parser_hosts = []
    index = 1
    while True:
        host = os.environ.get('PARSER_{}'.format(index))
        if host is None:
            # First gap in the numbering ends discovery.
            break
        self.parser_hosts.append(host)
        index += 1

    return self.parser_hosts
|
||||||
|
|
||||||
|
def parse_message(self, message, timeout=30):
    """Send the text of an email to every parser and collect the results.

    Args:
        message: the email message; its text is extracted with
            ``self.get_email_text``.
        timeout: seconds to wait for each parser before giving up.
            Without a timeout ``requests`` waits indefinitely, so one
            stalled parser would hang the whole crawl.

    Returns:
        A list with the items of every parser response, in the order of
        ``self.get_parsers()``. Empty when the message has no usable
        text.

    Raises:
        requests.HTTPError: if a parser answers with an error status
            (via ``raise_for_status``).
    """
    text = self.get_email_text(message)
    if not text:
        return []

    results = []
    for parser_host in self.get_parsers():
        response = requests.post(
            # Tolerate a base URL configured with a trailing slash.
            parser_host.rstrip('/') + '/parse',
            json={'message': text},
            timeout=timeout,
        )
        response.raise_for_status()
        # NOTE(review): assumes each parser replies with a JSON list.
        results += response.json()
    return results
|
||||||
|
|
||||||
def get_server(self):
|
def get_server(self):
|
||||||
server = IMAPClient(self.imap_url, use_uid=True)
|
server = IMAPClient(self.imap_url, use_uid=True)
|
||||||
server.login(self.imap_user, self.imap_pass)
|
server.login(self.imap_user, self.imap_pass)
|
||||||
@ -27,7 +58,8 @@ class MailCrawler(object):
|
|||||||
def get_email_text(self, message):
|
def get_email_text(self, message):
|
||||||
if not message.is_multipart():
|
if not message.is_multipart():
|
||||||
if self.is_valid_content_type(message):
|
if self.is_valid_content_type(message):
|
||||||
return message.get_payload(decode=True)
|
# TODO: Check encoding (maybe CHARSET)
|
||||||
|
return message.get_payload(decode=True).decode("utf-8")
|
||||||
else:
|
else:
|
||||||
content_type_to_payload = {
|
content_type_to_payload = {
|
||||||
payload.get_content_type(): self.get_email_text(payload)
|
payload.get_content_type(): self.get_email_text(payload)
|
||||||
@ -39,14 +71,17 @@ class MailCrawler(object):
|
|||||||
return text
|
return text
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def run(self):
    """Fetch recent INBOX messages, run them through the parsers, and
    print each result.

    Every message received since 2018-01-31 is fetched in full
    (``RFC822``) and passed to ``self.parse_message``. Each result gets
    the message subject added under the ``'subject'`` key and is printed
    to stdout.

    The IMAP session is always logged out, including when fetching or
    parsing raises, so connections are not left open on the server.
    """
    server = self.get_server()
    try:
        server.select_folder('INBOX')
        message_ids = server.search(['SINCE', date(2018, 1, 31)])
        for msgid, data in server.fetch(message_ids, 'RFC822').items():
            email_message = email.message_from_bytes(data[b'RFC822'])
            for result in self.parse_message(email_message):
                # NOTE(review): assumes each parser result is a dict.
                result.update({
                    'subject': email_message['SUBJECT'],
                })
                print(result)
    finally:
        server.logout()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1 +1,2 @@
|
|||||||
imapclient
|
imapclient
|
||||||
|
requests
|
||||||
|
@ -2,10 +2,13 @@ version: '2'
|
|||||||
services:
|
services:
|
||||||
crawler:
|
crawler:
|
||||||
build: ./crawler
|
build: ./crawler
|
||||||
|
links:
|
||||||
|
- parser_package_tracking
|
||||||
environment:
|
environment:
|
||||||
IMAP_URL: my.iamthefij.com
|
IMAP_URL: my.iamthefij.com
|
||||||
IMAP_USER: iamthefij@iamthefij.com
|
IMAP_USER: iamthefij@iamthefij.com
|
||||||
IMAP_PASS: "${IMAP_PASS}"
|
IMAP_PASS: "${IMAP_PASS}"
|
||||||
|
PARSER_1: http://parser_package_tracking:3000
|
||||||
parser_package_tracking:
|
parser_package_tracking:
|
||||||
build: ./parsers/package-tracking
|
build: ./parsers/package-tracking
|
||||||
ports:
|
ports:
|
||||||
|
Loading…
Reference in New Issue
Block a user