diff --git a/.gitignore b/.gitignore
index 8418f8e..850fa0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ __pycache__/
 
 # Distribution / packaging
 .Python
+virtualenv_run/
 env/
 build/
 develop-eggs/
diff --git a/crawler/crawler/__init__.py b/crawler/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/crawler/main.py b/crawler/crawler/main.py
new file mode 100644
index 0000000..905c35b
--- /dev/null
+++ b/crawler/crawler/main.py
@@ -0,0 +1,68 @@
+"""Crawl an IMAP inbox and print the text body of each recent message."""
+from getpass import getpass
+from datetime import date
+import email
+
+from imapclient import IMAPClient
+
+
+class MailCrawler(object):
+    """Fetch messages over IMAP and extract their text bodies."""
+
+    server_url = 'my.iamthefij.com'
+    # Ordered by preference: plain text is returned ahead of HTML.
+    valid_content_types = ['text/plain', 'text/html']
+
+    def get_credentials(self):
+        """Prompt for the password; return a (username, password) tuple."""
+        password = getpass('Password?')
+        return ('iamthefij@iamthefij.com', password)
+
+    def get_server(self):
+        """Connect to ``server_url`` and return the logged-in client."""
+        server = IMAPClient(self.server_url, use_uid=True)
+        server.login(*self.get_credentials())
+        return server
+
+    def is_valid_content_type(self, message):
+        """Return True if the message's content type is extractable."""
+        return message.get_content_type() in self.valid_content_types
+
+    def get_email_text(self, message):
+        """Return the payload of the preferred text part, or None.
+
+        The payload is transfer-decoded bytes. Parts are searched at any
+        nesting depth so a body inside a nested multipart is still found.
+        """
+        if not message.is_multipart():
+            if self.is_valid_content_type(message):
+                return message.get_payload(decode=True)
+            return None
+        for content_type in self.valid_content_types:
+            for part in message.walk():
+                if part.is_multipart():
+                    continue
+                if part.get_content_type() != content_type:
+                    continue
+                text = part.get_payload(decode=True)
+                if text:
+                    return text
+        return None
+
+    def run(self):
+        """Print the text of every INBOX message since 2018-01-31."""
+        server = self.get_server()
+        try:
+            server.select_folder('INBOX')
+            message_ids = server.search(['SINCE', date(2018, 1, 31)])
+            fetched = server.fetch(message_ids, 'RFC822')
+            for data in fetched.values():
+                email_message = email.message_from_bytes(data[b'RFC822'])
+                print(self.get_email_text(email_message))
+        finally:
+            # Always end the IMAP session, even if fetching fails.
+            server.logout()
+
+
+if __name__ == '__main__':
+    MailCrawler().run()
diff --git a/crawler/requirements-dev.txt b/crawler/requirements-dev.txt
new file mode 100644
index 0000000..988613e
--- /dev/null
+++ b/crawler/requirements-dev.txt
@@ -0,0 +1,2 @@
+ipdb
+ipython
diff --git a/crawler/requirements.txt b/crawler/requirements.txt
new file mode 100644
index 0000000..4de4db6
--- /dev/null
+++ b/crawler/requirements.txt
@@ -0,0 +1 @@
+imapclient