Update docker config for crawler service and parser

This commit is contained in:
IamTheFij 2018-02-02 18:03:00 -08:00
parent df4f53dac4
commit 5882fe6ebb
6 changed files with 35 additions and 12 deletions

1
.gitignore vendored
View File

@@ -124,3 +124,4 @@ _testmain.go
*.prof *.prof
.DS_Store .DS_Store
.env

3
crawler/Dockerfile Normal file
View File

@@ -0,0 +1,3 @@
FROM python:3.6-onbuild
CMD python -m crawler.main

View File

@@ -1,26 +1,28 @@
from getpass import getpass from getpass import getpass
from datetime import date from datetime import date
from ipdb import set_trace
import email import email
import os
from imapclient import IMAPClient from imapclient import IMAPClient
class MailCrawler(object): VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ]
server_url = 'my.iamthefij.com'
valid_content_types = [ 'text/plain', 'text/html' ]
def get_credentials(self):
password = getpass('Password?') class MailCrawler(object):
return ('iamthefij@iamthefij.com', password)
def __init__(self):
self.imap_url = os.environ['IMAP_URL']
self.imap_user = os.environ['IMAP_USER']
self.imap_pass = os.environ['IMAP_PASS']
def get_server(self): def get_server(self):
server = IMAPClient(self.server_url, use_uid=True) server = IMAPClient(self.imap_url, use_uid=True)
server.login(*self.get_credentials()) server.login(self.imap_user, self.imap_pass)
return server return server
def is_valid_content_type(self, message): def is_valid_content_type(self, message):
return message.get_content_type() in self.valid_content_types return message.get_content_type() in VALID_CONTENT_TYPES
def get_email_text(self, message): def get_email_text(self, message):
if not message.is_multipart(): if not message.is_multipart():
@@ -31,7 +33,7 @@ class MailCrawler(object):
payload.get_content_type(): self.get_email_text(payload) payload.get_content_type(): self.get_email_text(payload)
for payload in message.get_payload() for payload in message.get_payload()
} }
for content_type in self.valid_content_types: for content_type in VALID_CONTENT_TYPES:
text = content_type_to_payload.get(content_type) text = content_type_to_payload.get(content_type)
if text: if text:
return text return text
@@ -44,7 +46,6 @@ class MailCrawler(object):
message_ids = server.search(['SINCE', date(2018, 1, 31)]) message_ids = server.search(['SINCE', date(2018, 1, 31)])
for msgid, data in server.fetch(message_ids, 'RFC822').items(): for msgid, data in server.fetch(message_ids, 'RFC822').items():
email_message = email.message_from_bytes(data[b'RFC822']) email_message = email.message_from_bytes(data[b'RFC822'])
set_trace()
print(self.get_email_text(email_message)) print(self.get_email_text(email_message))

View File

@@ -0,0 +1,10 @@
version: '2'
services:
main:
build: .
volumes:
- .:/usr/src/app
environment:
IMAP_URL: my.iamthefij.com
IMAP_USER: iamthefij@iamthefij.com
IMAP_PASS: "${IMAP_PASS}"

View File

@@ -1,5 +1,11 @@
version: '2' version: '2'
services: services:
crawler:
build: ./crawler
environment:
IMAP_URL: my.iamthefij.com
IMAP_USER: iamthefij@iamthefij.com
IMAP_PASS: "${IMAP_PASS}"
parser_package_tracking: parser_package_tracking:
build: ./parsers/package-tracking build: ./parsers/package-tracking
ports: ports:

View File

@@ -2,5 +2,7 @@ version: '2'
services: services:
main: main:
build: . build: .
volumes:
- .:/src
ports: ports:
- "127.0.0.1:8183:3000" - "127.0.0.1:8183:3000"