Compare commits

...

16 Commits

23 changed files with 435 additions and 145 deletions

2
.gitignore vendored

@@ -125,3 +125,5 @@ _testmain.go
.DS_Store
.env
docker-compose-prod.yml
.mypy_cache

39
.pre-commit-config.yaml Normal file

@@ -0,0 +1,39 @@
---
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.3.0
    hooks:
      - id: check-added-large-files
      - id: check-yaml
        args:
          - --allow-multiple-documents
      - id: check-json
      - id: requirements-txt-fixer
      - id: trailing-whitespace
      - id: end-of-file-fixer
      # Python
      - id: debug-statements
      - id: check-merge-conflict
      - id: name-tests-test
        # exclude: tests/(common.py|util.py|(helpers)/(.+).py)
  - repo: https://github.com/psf/black
    rev: 20.8b1
    hooks:
      - id: black
  - repo: git://github.com/asottile/reorder_python_imports
    rev: v2.3.6
    hooks:
      - id: reorder-python-imports
        args:
          - --py3-plus
  - repo: git://github.com/dnephin/pre-commit-golang
    rev: v0.3.5
    hooks:
      - id: go-fmt
      - id: go-imports
      - id: golangci-lint
  # - repo: git://github.com/jumanjihouse/pre-commit-hooks
  #   rev: 1.11.0
  #   hooks:
  #     - id: rubocop
  #     - id: fasterer

21
Makefile Normal file

@@ -0,0 +1,21 @@
SUBDIRS := $(wildcard */.)

.PHONY: default test clean
default: check

.PHONY: all
all: $(SUBDIRS)

.PHONY: $(SUBDIRS)
$(SUBDIRS):
	$(MAKE) -C $@

# Installs pre-commit hooks
.PHONY: install-hooks
install-hooks:
	pre-commit install -f --install-hooks

# Checks files for encryption
.PHONY: check
check:
	pre-commit run --all-files


@@ -1,3 +1,3 @@
FROM python:3.6-onbuild
CMD python -m crawler.main
CMD python -m crawler.main -v

4
crawler/Makefile Normal file

@@ -0,0 +1,4 @@
.PHONY: default all test clean
default:
	@echo ok


@@ -1,35 +1,46 @@
from datetime import date
import logging
import os
from argparse import ArgumentParser
from datetime import datetime
from datetime import timedelta
from getpass import getpass
from imaplib import IMAP4
from time import sleep
import email
import json
import os
import requests
from dateutil import parser
from dateutil.tz import tzutc
from imbox import Imbox
import requests

VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ]
logging.basicConfig(
    level=logging.WARNING, format="%(asctime)s %(levelname)s %(name)s %(message)s"
)
logging.getLogger(__name__).addHandler(logging.NullHandler())
VALID_CONTENT_TYPES = ["text/plain", "text/html"]

def get_message_subject(message):
    """Returns message subject or a placeholder text"""
    return getattr(message, "subject", "NO SUBJECT")

class MailCrawler(object):
    parser_hosts = None
    indexer_host = os.environ['INDEXER']

    def __init__(self):
        self.imap_url = os.environ['IMAP_URL']
        self.imap_user = os.environ['IMAP_USER']
        self.imap_pass = os.environ['IMAP_PASS']
        self._logger = logging.getLogger(self.__class__.__name__)
        self.imap_url = os.environ["IMAP_URL"]
        self.imap_user = os.environ["IMAP_USER"]
        self.imap_pass = os.environ["IMAP_PASS"]
        self.parser_hosts = None
        self.indexer_host = os.environ.get("INDEXER")
        self.debug_mode = os.environ.get("DEBUG", False)

    def get_parsers(self):
        """Retrieves a list of parser hosts"""
        if self.parser_hosts is None:
            self.parser_hosts = []
            parser_format = 'PARSER_{}'
            parser_format = "PARSER_{}"
            parser_index = 1
            parser_host = os.environ.get(parser_format.format(parser_index))
            while parser_host is not None:
@@ -43,21 +54,21 @@ class MailCrawler(object):
        """Parses tokens from an email message"""
        text = self.get_email_text(message)
        if not text:
            print('No email text returned')
            print("No email text returned")
            return []
        results = []
        for parser_host in self.get_parsers():
            # print('Parsing email text... ', text)
            response = requests.post(
                parser_host+'/parse',
                parser_host + "/parse",
                json={
                    'subject': message.subject,
                    'message': text,
                    "subject": get_message_subject(message),
                    "message": text,
                },
            )
            response.raise_for_status()
            print('Got response', response.text)
            print("Got response", response.text)
            results += response.json()
        return results
@@ -72,16 +83,18 @@ class MailCrawler(object):
    def get_email_text(self, message):
        """Retrieves the text body of an email message"""
        body = message.body.get('plain') or message.body.get('html')
        body = message.body.get("plain") or message.body.get("html")
        if not body:
            return None
        # Concat all known body content together since it doesn't really matter
        return ''.join([text for text in body if isinstance(text, str)])
        return "".join([text for text in body if isinstance(text, str)])

    def index_token(self, message):
        """Sends a token from the parser to the indexer"""
        if self.indexer_host is None and self.debug_mode:
            print("DDB No indexer host, but OK for debugging")
        response = requests.post(
            self.indexer_host+'/token',
            self.indexer_host + "/token",
            json=message,
        )
        response.raise_for_status()
@@ -90,59 +103,115 @@ class MailCrawler(object):
    def process_message(self, message):
        """Process a single email message"""
        for result in self.parse_message(message):
            result.update({
                'subject': message.subject,
            })
            print('Parsed result: ', result)
            print('Indexed result: ', self.index_token(result))
            result.update(
                {
                    "subject": message.subject,
                }
            )
            print("Parsed result: ", result)
            print("Indexed result: ", self.index_token(result))

    def process_messages(self, server, since_date, last_message=0):
        for uid, message in server.messages(date__gt=since_date):
    def process_messages(self, server, since_date, last_uid=0):
        kwargs = {}
        if last_uid > 0:
            kwargs["uid__range"] = "{}:*".format(last_uid)
        else:
            kwargs["date__gt"] = since_date
        self._logger.info("Mailbox search kwargs %s", kwargs)
        for uid, message in server.messages(**kwargs):
            uid = int(uid)
            if uid <= last_message:
                print('DDB Already seen message with uid {}. Skipping'.format(uid))
            if uid <= last_uid:
                self._logger.debug(
                    "DDB Already seen message with uid {}. Skipping".format(uid)
                )
                continue
            print(
                'Processing message uid {} message_id {} '
                'with subject "{}"'.format(
                    uid, message.message_id, message.subject
                )
            self._logger.info(
                "Processing message uid %s message_id %s with subject '%s'",
                uid,
                message.message_id,
                get_message_subject(message),
            )
            self.process_message(message)
            # Update since_date
            message_date = parser.parse(message.date)
            print('DDB Processed message. Message date: {} Old date: {}'.format(
                message_date, since_date
            ))
            since_date = max(since_date, message_date)
            print('DDB Since date is now ', since_date)
            last_message = max(uid, last_message)
            self._logger.debug(
                "DDB Processed message. Message date: %s Old date: %s",
                message_date,
                since_date,
            )
            try:
                since_date = max(since_date, message_date)
            except TypeError:
                self._logger.error("Error comparing dates. We'll just use the last one")
            self._logger.debug("DDB Since date is now %s", since_date)
            last_uid = max(uid, last_uid)
        return since_date, last_message
        return since_date, last_uid
    def _parse_args(self, args=None):
        """Parses command line arguments and returns them"""
        parser = ArgumentParser(description="Inbox crawler")
        parser.add_argument(
            "--sleep",
            "-s",
            default=10 * 60,
            help=(
                "Number of seconds to wait between polling IMAP server."
                "Default 10 min"
            ),
        )
        parser.add_argument(
            "--verbosity",
            "-v",
            action="count",
            help=(
                "Adjust log verbosity by increasing arg count. Default log",
                "level is ERROR. Level increases with each `v`",
            ),
        )
        return parser.parse_args(args)

    def run(self):
        print('Starting crawler')
        # TODO: Put server into some kind of context manager and property
    def _set_log_level(self, verbosity):
        """Sets the log level for the class using the provided verbosity count"""
        if verbosity == 1:
            # Set this logger to info
            self._logger.setLevel(logging.INFO)
        elif verbosity == 2:
            # Set this logger to debug and root to info
            logging.getLogger().setLevel(logging.INFO)
            self._logger.setLevel(logging.DEBUG)
        elif verbosity >= 3:
            # Set the root logger to debug
            logging.getLogger().setLevel(logging.DEBUG)

    def run(self, args=None):
        args = self._parse_args(args=args)
        if args.verbosity:
            self._set_log_level(args.verbosity)
        self._logger.info("Starting crawler")
        with self.get_server() as server:
            # TODO: parameterize startup date, maybe relative
            since_date = datetime.now(tzutc()) - timedelta(days=1)
            last_message = 0
            since_date = datetime.now(tzutc()) - timedelta(days=16)
            last_uid = 0
            while True:
                print('Lets process')
                since_date, last_message = self.process_messages(
                    server,
                    since_date,
                    last_message=last_message
                print("Processing messages")
                since_date, last_uid = self.process_messages(
                    server, since_date, last_uid=last_uid
                )
                print('DDB Processed all. New since_date', since_date)
                # TODO: parameterize sleep
                # Sleep for 10 min
                sleep(10 * 60)
                self._logger.info(
                    "DDB Processed all. New since_date %s",
                    since_date,
                )
                sleep(args.sleep)

if __name__ == '__main__':
    MailCrawler().run()
if __name__ == "__main__":
    while True:
        try:
            MailCrawler().run()
        except IMAP4.abort:
            print("Imap abort. We will try to reconnect")
            pass
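
Note on the uid__range change above — a minimal standalone sketch (mine, not part of the diff), standard library only. Per RFC 3501, a UID range like "42:*" always matches the message with the highest UID even when that UID is not greater than 42, which is why process_messages() keeps the `uid <= last_uid` guard after switching to a UID-based search.

# Sketch of the search-criteria selection used by process_messages(): fall back
# to a date filter on the first pass, then switch to a UID range so mail that
# was already fetched is not re-downloaded. build_search_kwargs is a name made
# up for this sketch; the crawler builds the same dict inline.
from datetime import datetime, timedelta, timezone


def build_search_kwargs(since_date, last_uid=0):
    """Return imbox-style search kwargs for the next polling pass."""
    if last_uid > 0:
        # The server will still return the newest message even if its UID
        # equals last_uid; the caller is expected to skip it.
        return {"uid__range": "{}:*".format(last_uid)}
    # First pass: no UID seen yet, so select recent mail by date instead.
    return {"date__gt": since_date}


if __name__ == "__main__":
    a_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
    print(build_search_kwargs(a_day_ago))               # {'date__gt': ...}
    print(build_search_kwargs(a_day_ago, last_uid=42))  # {'uid__range': '42:*'}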


@@ -8,3 +8,5 @@ services:
      IMAP_URL: my.iamthefij.com
      IMAP_USER: iamthefij@iamthefij.com
      IMAP_PASS: "${IMAP_PASS}"
      INDEXER: http://indexer:5000
      PARSER_1: http://parser_package_tracking:3000
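
The INDEXER and PARSER_1 values added above line up with the crawler's environment handling: indexer_host now comes from INDEXER via os.environ.get(), and get_parsers() walks numbered PARSER_<n> variables starting at 1 until one is missing. The hunk earlier truncates that loop, so the completion below is an assumed reconstruction, shown as a small standalone sketch:

# Hypothetical helper mirroring the PARSER_<n> convention; discover_parsers is
# my own name, the crawler keeps the same logic inside MailCrawler.get_parsers().
import os


def discover_parsers(environ=os.environ):
    """Collect parser base URLs from PARSER_1, PARSER_2, ... until a gap."""
    hosts = []
    index = 1
    host = environ.get("PARSER_{}".format(index))
    while host is not None:
        hosts.append(host)
        index += 1
        host = environ.get("PARSER_{}".format(index))
    return hosts


if __name__ == "__main__":
    fake_env = {"PARSER_1": "http://parser_package_tracking:3000"}
    print(discover_parsers(fake_env))  # ['http://parser_package_tracking:3000']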


@@ -1,3 +1,3 @@
python-dateutil
imbox
python-dateutil
requests

4
indexer/Makefile Normal file

@@ -0,0 +1,4 @@
.PHONY: default all test clean
default:
	@echo ok


@@ -2,24 +2,25 @@ import json
import os
import sys
import flask
from flask import jsonify
from flask import request
from flask.ext.sqlalchemy import SQLAlchemy
import flask

app = flask.Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = os.environ.get(
    'SQLALCHEMY_DATABASE_URI',
    'sqlite:///../tokens.db'
app.config["SQLALCHEMY_DATABASE_URI"] = os.environ.get(
    "SQLALCHEMY_DATABASE_URI", "sqlite:///../tokens.db"
)
app.config['SQLALCHEMY_ECHO'] = True
app.config['DEBUG'] = True
app.config["SQLALCHEMY_ECHO"] = True
app.config["DEBUG"] = True
db = SQLAlchemy(app)

class EmailToken(db.Model):
    """Model to store the indexed tokens"""
    id = db.Column(db.Integer, primary_key=True)
    subject = db.Column(db.String(1024))
    token = db.Column(db.String(1024))
@@ -34,46 +35,47 @@ class EmailToken(db.Model):
    def as_dict(self):
        return {
            'id': self.id,
            'subject': self.subject,
            'token': self.token,
            'type': self.token_type,
            'metadata': self.get_token_metadata(),
            'disabled': self.disabled,
            "id": self.id,
            "subject": self.subject,
            "token": self.token,
            "type": self.token_type,
            "metadata": self.get_token_metadata(),
            "disabled": self.disabled,
        }

    @classmethod
    def from_json(cls, data):
        metadata = data.get('metadata')
        metadata = data.get("metadata")
        try:
            metadata = json.dumps(metadata)
        except TypeError as err:
            print('Error dumping metadata', err, file=sys.stderr)
            print("Error dumping metadata", err, file=sys.stderr)
        return cls(
            subject=data.get('subject'),
            token=data.get('token'),
            token_type=data.get('type'),
            subject=data.get("subject"),
            token=data.get("token"),
            token_type=data.get("type"),
            token_metadata=metadata,
            disabled=data.get('disabled', False),
            disabled=data.get("disabled", False),
        )

    @classmethod
    def jsonify_all(cls, token_type=None):
    def jsonify_all(cls, token_type=None, desc=False):
        query = cls.query
        if token_type:
            print('Filtering query by token type', file=sys.stderr)
            results = cls.query.filter_by(token_type=token_type).all()
        else:
            results = cls.query.all()
        return jsonify(tokens=[token.as_dict() for token in results])
            print("Filtering query by token type", file=sys.stderr)
            query = query.filter_by(token_type=token_type)
        if desc:
            query = query.order_by(cls.id.desc())
        return jsonify(tokens=[token.as_dict() for token in query.all()])

@app.route('/')
@app.route("/")
def check():
    return 'OK'
    return "OK"

@app.route('/token', methods=['POST'])
@app.route("/token", methods=["POST"])
def create_tokens():
    """Creates a token from posted JSON request"""
    new_token = EmailToken.from_json(request.get_json(force=True))
@@ -84,47 +86,41 @@ def create_tokens():
    ).first()
    print(
        'Received token with value {} and type {}'.format(
            new_token.token, new_token.token_type
        ), file=sys.stderr
        "Received token with value {} and type {}".format(
            new_token.token, new_token.token_type
        ),
        file=sys.stderr,
    )
    print('Existing token? ', existing_token, file=sys.stderr)
    print("Existing token? ", existing_token, file=sys.stderr)
    if not existing_token:
        print('No existing token, creating a new one', file=sys.stderr)
        print("No existing token, creating a new one", file=sys.stderr)
        db.session.add(new_token)
        db.session.commit()
        db.session.refresh(new_token)
        return jsonify(
            success=True,
            created=True,
            record=new_token.as_dict()
        )
        return jsonify(success=True, created=True, record=new_token.as_dict())
    else:
        print('Found an existing token', file=sys.stderr)
        return jsonify(
            success=True,
            created=False,
            record=existing_token.as_dict()
        )
        print("Found an existing token", file=sys.stderr)
        return jsonify(success=True, created=False, record=existing_token.as_dict())

@app.route('/token', methods=['GET'])
@app.route("/token", methods=["GET"])
def list_all_tokens():
    """Lists all tokens with an optional type filter"""
    token_type = request.args.get('filter_type')
    print('Asked to filter by ', token_type, file=sys.stderr)
    return EmailToken.jsonify_all(token_type=token_type)
    token_type = request.args.get("filter_type")
    desc = request.args.get("desc", False)
    print("Asked to filter by ", token_type, file=sys.stderr)
    return EmailToken.jsonify_all(token_type=token_type, desc=desc)

@app.route('/token/<int:token_id>', methods=['GET'])
@app.route("/token/<int:token_id>", methods=["GET"])
def get_token(token_id):
    """Gets a token by its primary key id"""
    token = EmailToken.query.get(token_id)
    return jsonify(token.as_dict())

if __name__ == '__main__':
if __name__ == "__main__":
    db.create_all()
    app.run(host='0.0.0.0', port=5000)
    app.run(host="0.0.0.0", port=5000)
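
For reference, a rough client sketch (not part of the diff) against the indexer's /token routes as they stand after this change. It assumes the address from docker-compose above (http://indexer:5000); the POST fields mirror EmailToken.from_json() and the query parameters mirror list_all_tokens().

import requests

INDEXER_URL = "http://indexer:5000"  # assumed, from the compose file above

# Create a token; an already-indexed value comes back with created=False.
created = requests.post(
    INDEXER_URL + "/token",
    json={
        "subject": "Your package has shipped",
        "token": "1Z999AA10123456784",  # placeholder tracking number
        "type": "SHIPPING",
        "metadata": {"carrier": "ups"},
    },
)
created.raise_for_status()

# List tokens, newest first. Flask query args arrive as strings, so any
# non-empty "desc" value (even "false") turns on the descending sort.
listing = requests.get(
    INDEXER_URL + "/token",
    params={"filter_type": "SHIPPING", "desc": "1"},
)
listing.raise_for_status()
print(listing.json()["tokens"])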


@@ -1,3 +1,3 @@
flask==0.12.2
sqlalchemy==1.2.2
flask-sqlalchemy==2.3.2
sqlalchemy==1.2.2

4
parsers/Makefile Normal file

@@ -0,0 +1,4 @@
.PHONY: default all test clean
default:
	@echo ok


@@ -1,14 +1,14 @@
FROM ruby:2.5.0
# TODO: Move to Gemfile
RUN gem install tracking_number -v 1.0.3
RUN gem install sinatra -v 2.0
EXPOSE 3000
RUN mkdir -p /src
WORKDIR /src
EXPOSE 3000
COPY Gemfile /src/
COPY Gemfile.lock /src/
RUN bundle install
COPY main.rb /src/
CMD ruby main.rb


@@ -0,0 +1,5 @@
source 'https://rubygems.org'
ruby '2.5'
gem 'tracking_number', '1.0.3'
gem 'sinatra', '2.0.1'


@@ -0,0 +1,40 @@
GEM
  remote: https://rubygems.org/
  specs:
    activesupport (5.1.5)
      concurrent-ruby (~> 1.0, >= 1.0.2)
      i18n (~> 0.7)
      minitest (~> 5.1)
      tzinfo (~> 1.1)
    concurrent-ruby (1.0.5)
    i18n (0.9.5)
      concurrent-ruby (~> 1.0)
    minitest (5.11.3)
    mustermann (1.0.2)
    rack (2.0.4)
    rack-protection (2.0.1)
      rack
    sinatra (2.0.1)
      mustermann (~> 1.0)
      rack (~> 2.0)
      rack-protection (= 2.0.1)
      tilt (~> 2.0)
    thread_safe (0.3.6)
    tilt (2.0.8)
    tracking_number (1.0.3)
      activesupport (>= 4.2.5)
    tzinfo (1.2.5)
      thread_safe (~> 0.1)

PLATFORMS
  ruby

DEPENDENCIES
  sinatra (= 2.0.1)
  tracking_number (= 1.0.3)

RUBY VERSION
   ruby 2.5.0p0

BUNDLED WITH
   1.16.1

4
viewers/Makefile Normal file

@@ -0,0 +1,4 @@
.PHONY: default all test clean
default:
	@echo ok


@@ -1,3 +1,2 @@
flask==0.12.2
requests
flask_bootstrap


@@ -1,45 +1,51 @@
import os
import sys
from flask_bootstrap import Bootstrap
import flask
import requests

app = flask.Flask(__name__)
app.config['DEBUG'] = True
Bootstrap(app)
app.config["DEBUG"] = True

indexer_url = os.environ.get('INDEXER_URL', 'http://indexer')
indexer_url = os.environ.get("INDEXER_URL", "http://indexer")

@app.route('/')
def check():
    return 'OK'
@app.route("/healthcheck")
def healthcheck():
    return "OK"

@app.route('/shipping')
@app.route("/")
def home():
    return flask.render_template("home.html")

@app.route("/shipping")
def get_tokens():
    resp = requests.get(
        indexer_url+'/token',
        params={'filter_type': 'SHIPPING'},
        indexer_url + "/token",
        params={
            "filter_type": "SHIPPING",
            "desc": True,
        },
    )
    resp.raise_for_status()
    tokens = resp.json().get('tokens')
    tokens = resp.json().get("tokens")
    for token in tokens:
        try:
            resp = requests.get(
                'http://viewer_package_tracking:3000/info/'+token['token']
                "http://viewer_package_tracking:3000/info/" + token["token"]
            )
            resp.raise_for_status()
            print('Response: ', resp.text, file=sys.stderr)
            print("Response: ", resp.text, file=sys.stderr)
            info = resp.json()
            token['metadata'].update(info)
            token["metadata"].update(info)
        except Exception as e:
            print('Error', e, file=sys.stderr)
            print("Error", e, file=sys.stderr)
            pass
    return flask.render_template('shipping.html', trackers=tokens)
    return flask.render_template("shipping.html", trackers=tokens)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
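
A quick smoke-test sketch for the viewer's reshuffled routes using Flask's built-in test client. It assumes the module above is importable as `main` and that the base.html template extended by home.html already exists in the repo; neither is shown in this diff.

import main  # assumed module name for the viewer app above


def test_routes():
    client = main.app.test_client()
    # "/healthcheck" now carries the old health string...
    assert client.get("/healthcheck").data == b"OK"
    # ...while "/" renders the new home.html template.
    assert client.get("/").status_code == 200


if __name__ == "__main__":
    test_routes()
    print("ok")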


@@ -0,0 +1,17 @@
{% extends "base.html" %}
{% block title %}Home{% endblock %}

{% block navbar_nav %}
<li class="nav-item">
  <a class="nav-link" href="/shipping">Shipping</a>
</li>
{% endblock %}

{% block container %}
<div class="row mt-3">
  <div class="col">
    <h2>Email Assistant</h2>
    <p>Let's get trackin!</p>
  </div>
</div>
{% endblock %}


@@ -16,7 +16,7 @@
</div>
<div class="row mt-2">
<div class="col">
<table class="table">
<table class="table table-responsive">
<thead>
<tr>
<th scope="col">#</th>


@@ -1,14 +1,14 @@
FROM ruby:2.5.0
# TODO: Move to Gemfile
RUN gem install trackerific -v 0.8.0
RUN gem install sinatra -v 2.0
EXPOSE 3000
RUN mkdir -p /src
WORKDIR /src
EXPOSE 3000
COPY Gemfile /src/
COPY Gemfile.lock /src/
RUN bundle install
COPY main.rb /src/
CMD ruby main.rb


@@ -0,0 +1,5 @@
source 'https://rubygems.org'
ruby '2.5'
gem 'trackerific', '0.8.0'
gem 'sinatra', '2.0.1'


@@ -0,0 +1,73 @@
GEM
  remote: https://rubygems.org/
  specs:
    activesupport (5.1.5)
      concurrent-ruby (~> 1.0, >= 1.0.2)
      i18n (~> 0.7)
      minitest (~> 5.1)
      tzinfo (~> 1.1)
    akami (1.2.2)
      gyoku (>= 0.4.0)
      nokogiri
    builder (3.2.3)
    concurrent-ruby (1.0.5)
    gyoku (1.1.1)
      builder (>= 2.1.2)
    httparty (0.16.0)
      multi_xml (>= 0.5.2)
    httpi (2.1.1)
      rack
      rubyntlm (~> 0.3.2)
    i18n (0.9.5)
      concurrent-ruby (~> 1.0)
    mime-types (1.25.1)
    mini_portile2 (2.3.0)
    minitest (5.11.3)
    multi_xml (0.6.0)
    mustermann (1.0.2)
    nokogiri (1.8.2)
      mini_portile2 (~> 2.3.0)
    nori (2.3.0)
    rack (2.0.4)
    rack-protection (2.0.1)
      rack
    rubyntlm (0.3.4)
    savon (2.3.3)
      akami (~> 1.2.0)
      builder (>= 2.1.2)
      gyoku (~> 1.1.0)
      httpi (~> 2.1.0)
      nokogiri (>= 1.4.0)
      nori (~> 2.3.0)
      wasabi (~> 3.2.2)
    sinatra (2.0.1)
      mustermann (~> 1.0)
      rack (~> 2.0)
      rack-protection (= 2.0.1)
      tilt (~> 2.0)
    thread_safe (0.3.6)
    tilt (2.0.8)
    trackerific (0.8.0)
      activesupport
      builder
      httparty (>= 0.12.0)
      savon (~> 2.3.0)
    tzinfo (1.2.5)
      thread_safe (~> 0.1)
    wasabi (3.2.3)
      httpi (~> 2.0)
      mime-types (< 2.0.0)
      nokogiri (>= 1.4.0)

PLATFORMS
  ruby

DEPENDENCIES
  sinatra (= 2.0.1)
  trackerific (= 0.8.0)

RUBY VERSION
   ruby 2.5.0p0

BUNDLED WITH
   1.16.1