Update crawler logging and handling of unicode errors

IamTheFij 2018-02-05 17:53:28 -08:00
parent 271513312f
commit 7595af98f9


@@ -13,7 +13,7 @@ VALID_CONTENT_TYPES = [ 'text/plain', 'text/html' ]
 class MailCrawler(object):
     parser_hosts = None
-    indexer_host = os.environ["INDEXER"]
+    indexer_host = os.environ['INDEXER']
 
     def __init__(self):
         self.imap_url = os.environ['IMAP_URL']
@@ -36,15 +36,21 @@ class MailCrawler(object):
     def parse_message(self, message):
         text = self.get_email_text(message)
         if not text:
+            print('No email text returned')
             return []
 
         results = []
         for parser_host in self.get_parsers():
+            # print('Parsing email text... ', text)
             response = requests.post(
                 parser_host+'/parse',
-                json={'message': text},
+                json={
+                    'subject': message['SUBJECT'],
+                    'message': text,
+                },
             )
             response.raise_for_status()
+            print('Got response', response.text)
             results += response.json()
 
         return results
@@ -60,7 +66,11 @@ class MailCrawler(object):
         if not message.is_multipart():
             if self.is_valid_content_type(message):
                 # TODO: Check encoding (maybe CHARSET)
-                return message.get_payload(decode=True).decode("utf-8")
+                try:
+                    return message.get_payload(decode=True).decode('utf-8')
+                except UnicodeDecodeError:
+                    print('Error decoding')
+                    return None
         else:
             content_type_to_payload = {
                 payload.get_content_type(): self.get_email_text(payload)
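
This hunk logs and skips payloads that fail to decode as UTF-8 rather than crashing the crawler. One possible follow-up for the remaining CHARSET TODO, sketched here only as an illustration (not part of this commit), is to prefer the charset declared on the MIME part and fall back to a lossy UTF-8 decode:

    def decode_payload(part):
        # Decode a single, non-multipart MIME part to text.
        payload = part.get_payload(decode=True)
        if payload is None:
            return None
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or mismatched charset: replace bad bytes instead of
            # dropping the whole message.
            return payload.decode('utf-8', errors='replace')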
@@ -81,17 +91,19 @@ class MailCrawler(object):
         return response.json()
 
     def run(self):
+        print('Starting crawler')
         server = self.get_server()
         server.select_folder('INBOX')
         message_ids = server.search(['SINCE', date(2018, 1, 31)])
 
         for msgid, data in server.fetch(message_ids, 'RFC822').items():
+            print('Fetched message with id ', msgid)
             email_message = email.message_from_bytes(data[b'RFC822'])
             for result in self.parse_message(email_message):
                 result.update({
                     'subject': email_message['SUBJECT'],
                 })
-                print("Parsed result: ", result)
-                print("Indexed result: ", self.index_message(result))
+                print('Parsed result: ', result)
+                print('Indexed result: ', self.index_message(result))
 
 if __name__ == '__main__':
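
The run() loop relies on a get_server() helper that is not shown in this diff. Judging from select_folder, search, and fetch, it presumably wraps an IMAPClient connection; a hypothetical sketch, assuming the imapclient package and IMAP_USER / IMAP_PASS environment variables in addition to the IMAP_URL used above:

    import os
    from imapclient import IMAPClient

    def get_server():
        # Connect over SSL and authenticate; env var names other than
        # IMAP_URL are assumptions for this sketch.
        server = IMAPClient(os.environ['IMAP_URL'], use_uid=True)
        server.login(os.environ['IMAP_USER'], os.environ['IMAP_PASS'])
        return server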