From 9710c6acdf63d866e44e1f77d3bd5c578f329ecb Mon Sep 17 00:00:00 2001 From: Nguyễn Gia Phong Date: Fri, 7 Jan 2022 21:58:14 +0700 Subject: Improve extraction performance Yes I'm micro-optimizing, gotta go fast. --- src/formbox.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/formbox.py b/src/formbox.py index 16c0221..84c778a 100644 --- a/src/formbox.py +++ b/src/formbox.py @@ -16,6 +16,7 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. from argparse import ArgumentParser +from collections import defaultdict from email.header import decode_header from email.utils import parsedate_to_datetime from functools import partial @@ -32,15 +33,6 @@ sanitise = partial(clean, tags=('a', 'code', 'em', 'strong', 'sub', 'sup', 'irc', 'ircs', 'mailto', 'matrix', 'xmpp')) -def extract(archive, parent): - """Recursively extract emails in reply to given message ID.""" - for message_id, message in archive.copy().items(): - # TODO: handle multipart - if message['In-Reply-To'] != parent: continue - archive.pop(message_id) - yield message, extract(archive, message_id) - - def decode(header): """Return the decoded email header.""" for string, charset in decode_header(header): @@ -65,17 +57,18 @@ def date(message): return parsedate_to_datetime(message['Date']).date() -def render(template, forest, parent): +def render(template, archive, parent): """Render the thread recursively based on given template.""" - for self, children in forest: + for self in archive[parent]: message_id = self['Message-Id'] try: author, address = decode(self['From']) except ValueError: author = self['From'] + # TODO: handle multipart body = sanitise(linkify(markdown(self.get_payload(), output_format='html5'))) - rendered_children = render(template, children, message_id) + rendered_children = render(template, archive, message_id) yield template.format(message_id=quote(message_id), mailto_params=urlencode(dict(reply_to(self))), date=date(self).isoformat(), author=author, 
@@ -90,10 +83,11 @@ def main(): parser.add_argument('template', type=Path, help='path to template') args = parser.parse_args() - archive = {m['Message-Id']: m for m in sorted(mbox(args.mbox), key=date)} + archive = defaultdict(list) + for message in sorted(mbox(args.mbox), key=date): + archive[message['In-Reply-To']].append(message) template = args.template.read_text() - print(*render(template, extract(archive, args.id), args.id), - sep='', end='') + print(*render(template, archive, args.id), sep='', end='') if __name__ == '__main__': main() -- cgit 1.4.1