diff --git a/.gitignore b/.gitignore index 86e30db..343583b 100644 --- a/.gitignore +++ b/.gitignore @@ -126,3 +126,4 @@ _testmain.go .DS_Store .env docker-compose-prod.yml +.mypy_cache diff --git a/crawler/crawler/main.py b/crawler/crawler/main.py index 8595677..25a4ed5 100644 --- a/crawler/crawler/main.py +++ b/crawler/crawler/main.py @@ -13,36 +13,34 @@ from imbox import Imbox logging.basicConfig( - level=logging.WARNING, - format='%(asctime)s %(levelname)s %(name)s %(message)s' + level=logging.WARNING, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) logging.getLogger(__name__).addHandler(logging.NullHandler()) -VALID_CONTENT_TYPES = ['text/plain', 'text/html'] +VALID_CONTENT_TYPES = ["text/plain", "text/html"] def get_message_subject(message): """Returns message subject or a placeholder text""" - return getattr(message, 'subject', 'NO SUBJECT') + return getattr(message, "subject", "NO SUBJECT") class MailCrawler(object): - def __init__(self): self._logger = logging.getLogger(self.__class__.__name__) - self.imap_url = os.environ['IMAP_URL'] - self.imap_user = os.environ['IMAP_USER'] - self.imap_pass = os.environ['IMAP_PASS'] + self.imap_url = os.environ["IMAP_URL"] + self.imap_user = os.environ["IMAP_USER"] + self.imap_pass = os.environ["IMAP_PASS"] self.parser_hosts = None - self.indexer_host = os.environ.get('INDEXER') - self.debug_mode = os.environ.get('DEBUG', False) + self.indexer_host = os.environ.get("INDEXER") + self.debug_mode = os.environ.get("DEBUG", False) def get_parsers(self): """Retrieves a list of parser hosts""" if self.parser_hosts is None: self.parser_hosts = [] - parser_format = 'PARSER_{}' + parser_format = "PARSER_{}" parser_index = 1 parser_host = os.environ.get(parser_format.format(parser_index)) while parser_host is not None: @@ -56,21 +54,21 @@ class MailCrawler(object): """Parses tokens from an email message""" text = self.get_email_text(message) if not text: - print('No email text returned') + print("No email text returned") return [] results = [] for parser_host in self.get_parsers(): # print('Parsing email text... ', text) response = requests.post( - parser_host+'/parse', + parser_host + "/parse", json={ - 'subject': get_message_subject(message), - 'message': text, + "subject": get_message_subject(message), + "message": text, }, ) response.raise_for_status() - print('Got response', response.text) + print("Got response", response.text) results += response.json() return results @@ -85,18 +83,18 @@ class MailCrawler(object): def get_email_text(self, message): """Retrieves the text body of an email message""" - body = message.body.get('plain') or message.body.get('html') + body = message.body.get("plain") or message.body.get("html") if not body: return None # Concat all known body content together since it doesn't really matter - return ''.join([text for text in body if isinstance(text, str)]) + return "".join([text for text in body if isinstance(text, str)]) def index_token(self, message): """Sends a token from the parser to the indexer""" if self.indexer_host is None and self.debug_mode: print("DDB No indexer host, but OK for debugging") response = requests.post( - self.indexer_host+'/token', + self.indexer_host + "/token", json=message, ) response.raise_for_status() @@ -105,9 +103,11 @@ class MailCrawler(object): def process_message(self, message): """Process a single email message""" for result in self.parse_message(message): - result.update({ - "subject": message.subject, - }) + result.update( + { + "subject": message.subject, + } + ) print("Parsed result: ", result) print("Indexed result: ", self.index_token(result)) @@ -138,14 +138,13 @@ class MailCrawler(object): message_date = parser.parse(message.date) self._logger.debug( "DDB Processed message. Message date: %s Old date: %s", - message_date, since_date + message_date, + since_date, ) try: since_date = max(since_date, message_date) except TypeError: - self._logger.error( - "Error comparing dates. We'll just use the last one" - ) + self._logger.error("Error comparing dates. We'll just use the last one") self._logger.debug("DDB Since date is now %s", since_date) last_uid = max(uid, last_uid) @@ -155,16 +154,22 @@ class MailCrawler(object): """Parses command line arguments and returns them""" parser = ArgumentParser(description="Inbox crawler") parser.add_argument( - "--sleep", "-s", - default=10*60, - help=("Number of seconds to wait between polling IMAP server." - "Default 10 min"), + "--sleep", + "-s", + default=10 * 60, + help=( + "Number of seconds to wait between polling IMAP server." + "Default 10 min" + ), ) parser.add_argument( - "--verbosity", "-v", + "--verbosity", + "-v", action="count", - help=("Adjust log verbosity by increasing arg count. Default log", - "level is ERROR. Level increases with each `v`"), + help=( + "Adjust log verbosity by increasing arg count. Default log", + "level is ERROR. Level increases with each `v`", + ), ) return parser.parse_args(args) @@ -186,7 +191,7 @@ class MailCrawler(object): if args.verbosity: self._set_log_level(args.verbosity) - self._logger.info('Starting crawler') + self._logger.info("Starting crawler") with self.get_server() as server: # TODO: parameterize startup date, maybe relative since_date = datetime.now(tzutc()) - timedelta(days=16) @@ -194,9 +199,7 @@ class MailCrawler(object): while True: print("Processing messages") since_date, last_uid = self.process_messages( - server, - since_date, - last_uid=last_uid + server, since_date, last_uid=last_uid ) self._logger.info( "DDB Processed all. New since_date %s", @@ -205,10 +208,10 @@ class MailCrawler(object): sleep(args.sleep) -if __name__ == '__main__': +if __name__ == "__main__": while True: try: MailCrawler().run() except IMAP4.abort: - print('Imap abort. We will try to reconnect') + print("Imap abort. We will try to reconnect") pass diff --git a/crawler/requirements.txt b/crawler/requirements.txt index 13913f8..69b8301 100644 --- a/crawler/requirements.txt +++ b/crawler/requirements.txt @@ -1,3 +1,3 @@ +imbox python-dateutil requests -git+https://github.com/martinrusev/imbox@fd68b35e22686f43cdb7e3df344efc9b3a26b1e6 diff --git a/indexer/indexer/main.py b/indexer/indexer/main.py index ba1c050..049fa05 100644 --- a/indexer/indexer/main.py +++ b/indexer/indexer/main.py @@ -2,24 +2,25 @@ import json import os import sys +import flask from flask import jsonify from flask import request from flask.ext.sqlalchemy import SQLAlchemy -import flask app = flask.Flask(__name__) -app.config['SQLALCHEMY_DATABASE_URI'] = os.environ.get( - 'SQLALCHEMY_DATABASE_URI', - 'sqlite:///../tokens.db' +app.config["SQLALCHEMY_DATABASE_URI"] = os.environ.get( + "SQLALCHEMY_DATABASE_URI", "sqlite:///../tokens.db" ) -app.config['SQLALCHEMY_ECHO'] = True -app.config['DEBUG'] = True +app.config["SQLALCHEMY_ECHO"] = True +app.config["DEBUG"] = True db = SQLAlchemy(app) + class EmailToken(db.Model): """Model to store the indexed tokens""" + id = db.Column(db.Integer, primary_key=True) subject = db.Column(db.String(1024)) token = db.Column(db.String(1024)) @@ -34,47 +35,47 @@ class EmailToken(db.Model): def as_dict(self): return { - 'id': self.id, - 'subject': self.subject, - 'token': self.token, - 'type': self.token_type, - 'metadata': self.get_token_metadata(), - 'disabled': self.disabled, + "id": self.id, + "subject": self.subject, + "token": self.token, + "type": self.token_type, + "metadata": self.get_token_metadata(), + "disabled": self.disabled, } @classmethod def from_json(cls, data): - metadata = data.get('metadata') + metadata = data.get("metadata") try: metadata = json.dumps(metadata) except TypeError as err: - print('Error dumping metadata', err, file=sys.stderr) + print("Error dumping metadata", err, file=sys.stderr) return cls( - subject=data.get('subject'), - token=data.get('token'), - token_type=data.get('type'), + subject=data.get("subject"), + token=data.get("token"), + token_type=data.get("type"), token_metadata=metadata, - disabled=data.get('disabled', False), + disabled=data.get("disabled", False), ) @classmethod def jsonify_all(cls, token_type=None, desc=False): query = cls.query if token_type: - print('Filtering query by token type', file=sys.stderr) + print("Filtering query by token type", file=sys.stderr) query = query.filter_by(token_type=token_type) if desc: query = query.order_by(cls.id.desc()) return jsonify(tokens=[token.as_dict() for token in query.all()]) -@app.route('/') +@app.route("/") def check(): - return 'OK' + return "OK" -@app.route('/token', methods=['POST']) +@app.route("/token", methods=["POST"]) def create_tokens(): """Creates a token from posted JSON request""" new_token = EmailToken.from_json(request.get_json(force=True)) @@ -85,48 +86,41 @@ def create_tokens(): ).first() print( - 'Received token with value {} and type {}'.format( - new_token.token, new_token.token_type - ), file=sys.stderr + "Received token with value {} and type {}".format( + new_token.token, new_token.token_type + ), + file=sys.stderr, ) - print('Existing token? ', existing_token, file=sys.stderr) + print("Existing token? ", existing_token, file=sys.stderr) if not existing_token: - print('No existing token, creating a new one', file=sys.stderr) + print("No existing token, creating a new one", file=sys.stderr) db.session.add(new_token) db.session.commit() db.session.refresh(new_token) - return jsonify( - success=True, - created=True, - record=new_token.as_dict() - ) + return jsonify(success=True, created=True, record=new_token.as_dict()) else: - print('Found an existing token', file=sys.stderr) - return jsonify( - success=True, - created=False, - record=existing_token.as_dict() - ) + print("Found an existing token", file=sys.stderr) + return jsonify(success=True, created=False, record=existing_token.as_dict()) -@app.route('/token', methods=['GET']) +@app.route("/token", methods=["GET"]) def list_all_tokens(): """Lists all tokens with an optional type filter""" - token_type = request.args.get('filter_type') - desc = request.args.get('desc', False) - print('Asked to filter by ', token_type, file=sys.stderr) + token_type = request.args.get("filter_type") + desc = request.args.get("desc", False) + print("Asked to filter by ", token_type, file=sys.stderr) return EmailToken.jsonify_all(token_type=token_type, desc=desc) -@app.route('/token/', methods=['GET']) +@app.route("/token/", methods=["GET"]) def get_token(token_id): """Gets a token by its primary key id""" token = EmailToken.query.get(token_id) return jsonify(token.as_dict()) -if __name__ == '__main__': +if __name__ == "__main__": db.create_all() - app.run(host='0.0.0.0', port=5000) + app.run(host="0.0.0.0", port=5000) diff --git a/indexer/requirements.txt b/indexer/requirements.txt index 0453f33..da1e6af 100644 --- a/indexer/requirements.txt +++ b/indexer/requirements.txt @@ -1,3 +1,3 @@ flask==0.12.2 -sqlalchemy==1.2.2 flask-sqlalchemy==2.3.2 +sqlalchemy==1.2.2 diff --git a/viewers/main/viewer/main.py b/viewers/main/viewer/main.py index 3130a8b..9911500 100644 --- a/viewers/main/viewer/main.py +++ b/viewers/main/viewer/main.py @@ -6,45 +6,46 @@ import requests app = flask.Flask(__name__) -app.config['DEBUG'] = True +app.config["DEBUG"] = True -indexer_url = os.environ.get('INDEXER_URL', 'http://indexer') +indexer_url = os.environ.get("INDEXER_URL", "http://indexer") -@app.route('/healthcheck') +@app.route("/healthcheck") def healthcheck(): - return 'OK' + return "OK" -@app.route('/') + +@app.route("/") def home(): - return flask.render_template('home.html') + return flask.render_template("home.html") -@app.route('/shipping') +@app.route("/shipping") def get_tokens(): resp = requests.get( - indexer_url+'/token', + indexer_url + "/token", params={ - 'filter_type': 'SHIPPING', - 'desc': True, + "filter_type": "SHIPPING", + "desc": True, }, ) resp.raise_for_status() - tokens = resp.json().get('tokens') + tokens = resp.json().get("tokens") for token in tokens: try: resp = requests.get( - 'http://viewer_package_tracking:3000/info/'+token['token'] + "http://viewer_package_tracking:3000/info/" + token["token"] ) resp.raise_for_status() - print('Response: ', resp.text, file=sys.stderr) + print("Response: ", resp.text, file=sys.stderr) info = resp.json() - token['metadata'].update(info) + token["metadata"].update(info) except Exception as e: - print('Error', e, file=sys.stderr) + print("Error", e, file=sys.stderr) pass - return flask.render_template('shipping.html', trackers=tokens) + return flask.render_template("shipping.html", trackers=tokens) -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5000) +if __name__ == "__main__": + app.run(host="0.0.0.0", port=5000)