Use imbox instead of imapclient
This commit is contained in:
parent
859f0e040e
commit
814f57a2e4
@ -1,10 +1,12 @@
|
|||||||
from getpass import getpass
|
|
||||||
from datetime import date
|
from datetime import date
|
||||||
import json
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
from getpass import getpass
|
||||||
import email
|
import email
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from imapclient import IMAPClient
|
from imbox import Imbox
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
||||||
@ -45,7 +47,7 @@ class MailCrawler(object):
|
|||||||
response = requests.post(
|
response = requests.post(
|
||||||
parser_host+'/parse',
|
parser_host+'/parse',
|
||||||
json={
|
json={
|
||||||
'subject': message['SUBJECT'],
|
'subject': message.subject,
|
||||||
'message': text,
|
'message': text,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@ -55,32 +57,19 @@ class MailCrawler(object):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def get_server(self):
|
def get_server(self):
|
||||||
server = IMAPClient(self.imap_url, use_uid=True)
|
return Imbox(
|
||||||
server.login(self.imap_user, self.imap_pass)
|
self.imap_url,
|
||||||
return server
|
username=self.imap_user,
|
||||||
|
password=self.imap_pass,
|
||||||
def is_valid_content_type(self, message):
|
ssl=True,
|
||||||
return message.get_content_type() in VALID_CONTENT_TYPES
|
)
|
||||||
|
|
||||||
def get_email_text(self, message):
|
def get_email_text(self, message):
|
||||||
if not message.is_multipart():
|
body = message.body.get('plain') or message.body.get('html')
|
||||||
if self.is_valid_content_type(message):
|
if not body:
|
||||||
# TODO: Check encoding (maybe CHARSET)
|
return None
|
||||||
try:
|
# Concat all known body content together since it doesn't really matter
|
||||||
return message.get_payload(decode=True).decode('utf-8')
|
return ''.join([text for text in body if isinstance(text, str)])
|
||||||
except UnicodeDecodeError:
|
|
||||||
print('Error decoding')
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
content_type_to_payload = {
|
|
||||||
payload.get_content_type(): self.get_email_text(payload)
|
|
||||||
for payload in message.get_payload()
|
|
||||||
}
|
|
||||||
for content_type in VALID_CONTENT_TYPES:
|
|
||||||
text = content_type_to_payload.get(content_type)
|
|
||||||
if text:
|
|
||||||
return text
|
|
||||||
return None
|
|
||||||
|
|
||||||
def index_message(self, message):
|
def index_message(self, message):
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
@ -92,18 +81,22 @@ class MailCrawler(object):
|
|||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
print('Starting crawler')
|
print('Starting crawler')
|
||||||
server = self.get_server()
|
|
||||||
server.select_folder('INBOX')
|
with self.get_server() as server:
|
||||||
message_ids = server.search(['SINCE', date(2018, 1, 31)])
|
since_date = datetime.now() - timedelta(days=30)
|
||||||
for msgid, data in server.fetch(message_ids, 'RFC822').items():
|
for uid, message in server.messages(date__gt=since_date):
|
||||||
print('Fetched message with id ', msgid)
|
print(
|
||||||
email_message = email.message_from_bytes(data[b'RFC822'])
|
'Processing message uid {} message_id {} '
|
||||||
for result in self.parse_message(email_message):
|
'with subject "{}"'.format(
|
||||||
result.update({
|
uid, message.message_id, message.subject
|
||||||
'subject': email_message['SUBJECT'],
|
)
|
||||||
})
|
)
|
||||||
print('Parsed result: ', result)
|
for result in self.parse_message(message):
|
||||||
print('Indexed result: ', self.index_message(result))
|
result.update({
|
||||||
|
'subject': message.subject,
|
||||||
|
})
|
||||||
|
print('Parsed result: ', result)
|
||||||
|
print('Indexed result: ', self.index_message(result))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1,2 +1,2 @@
|
|||||||
imapclient
|
imbox
|
||||||
requests
|
requests
|
||||||
|
Loading…
Reference in New Issue
Block a user