about summary refs log tree commit diff
path: root/src/formbox.py
blob: 5e30faf926c4e5e7e9f2321cdb48ab806d2175cd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
# Format mbox as HTML/XML
# Copyright (C) 2021-2022  Nguyễn Gia Phong
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from argparse import ArgumentParser
from collections import defaultdict
from email.header import decode_header
from email.utils import parsedate_to_datetime
from functools import partial
from html import escape
from mailbox import mbox
from pathlib import Path
from urllib.parse import quote, unquote, urlencode

from bleach import clean, linkify
from markdown import markdown

# HTML elements passed to bleach as the allowed tags
_ALLOWED_TAGS = ('a', 'code', 'em', 'strong', 'sub', 'sup',
                 'blockquote', 'p', 'pre', 'ul', 'ol', 'li')
# URL schemes passed to bleach as the allowed protocols
_ALLOWED_PROTOCOLS = ('ftp', 'gemini', 'gopher', 'http', 'https',
                      'irc', 'ircs', 'mailto', 'matrix', 'xmpp')
# HTML sanitiser preconfigured with the allow-lists above
sanitise = partial(clean, tags=_ALLOWED_TAGS, protocols=_ALLOWED_PROTOCOLS)


def get_body(message):
    """Return the Markdown message body converted to HTML.

    Multipart messages are searched recursively, in order, and the first
    text/markdown or text/plain part found is rendered, linkified
    and sanitised.  Return None if the message has no such part.
    """
    if message.is_multipart():
        for payload in map(get_body, message.get_payload()):
            if payload is not None: return payload
    elif message.get_content_type() in ('text/markdown', 'text/plain'):
        # get_payload(decode=True) undoes the transfer encoding and
        # returns bytes, whereas markdown only accepts str: decode
        # using the charset declared by the part, UTF-8 by default.
        payload = message.get_payload(decode=True)
        charset = message.get_content_charset('utf-8')
        try:
            text = payload.decode(charset, errors='replace')
        except LookupError:  # charset unknown to Python
            text = payload.decode('utf-8', errors='replace')
        return sanitise(linkify(markdown(text, output_format='html5')))
    return None


def decode(header):
    """Yield the decoded parts of the given email header as str.

    Each part returned by email.header.decode_header is decoded
    using its declared charset, or UTF-8 if none is declared.
    Undecodable bytes are replaced instead of raising.
    """
    for string, charset in decode_header(header):
        # decode_header returns the header untouched, as str,
        # when it does not contain any encoded word.
        if isinstance(string, str):
            yield string
            continue
        try:
            yield string.decode(charset or 'utf-8', errors='replace')
        except LookupError:  # charset unknown to Python
            yield string.decode('utf-8', errors='replace')


def reply_to(message):
    """Yield mailto parameters, as pairs, for replying to the given email.

    The pairs are In-Reply-To, then Cc (the Reply-To header if present,
    otherwise From), then Subject prefixed by "Re: " unless it already
    starts with one.  No Subject is yielded if the email has none.
    """
    yield 'In-Reply-To', message['Message-ID']
    yield 'Cc', message.get('Reply-To', message['From'])
    subject = message['Subject']
    if subject is not None:
        already_reply = subject.lower().startswith('re:')
        yield 'Subject', subject if already_reply else f'Re: {subject}'


def date(message):
    """Return the calendar date parsed from the given email's Date header."""
    timestamp = parsedate_to_datetime(message['Date'])
    return timestamp.date()


def render(template, archive, parent):
    """Render the thread recursively based on given template.

    For each message in archive[parent], sorted by date, yield
    the template formatted with the fields message_id, mailto_params,
    date, author, body and children, the latter being the rendered
    replies to that message joined by newlines.  Messages without
    a renderable body are skipped together with their replies.
    """
    for self in sorted(archive[parent], key=date):
        body = get_body(self)
        if body is None: continue
        message_id = self['Message-Id']
        # Please don't have space in email addresses
        author = ' '.join(decode(self['From'])).rsplit(maxsplit=1)[0]
        rendered_children = render(template, archive, message_id)
        # The display name comes straight from the From header, which
        # is controlled by the sender: escape it so that it cannot
        # inject markup into the HTML/XML output.
        yield template.format(message_id=quote(message_id),
                              mailto_params=urlencode(dict(reply_to(self))),
                              date=date(self).isoformat(),
                              author=escape(author),
                              body=body, children='\n'.join(rendered_children))


def main():
    """Parse command-line arguments and print the rendered thread."""
    parser = ArgumentParser(description='format mbox as HTML/XML')
    # Positional arguments as (name, converter, help text)
    for name, converter, description in (
            ('mbox', mbox, 'path to mbox file'),
            ('id', unquote, 'root message ID'),
            ('template', Path, 'path to template')):
        parser.add_argument(name, type=converter, help=description)
    args = parser.parse_args()

    # Group messages by the ID of the message they reply to
    archive = defaultdict(list)
    for message in args.mbox:
        archive[message['In-Reply-To']].append(message)
    layout = args.template.read_text()
    print(*render(layout, archive, args.id), sep='', end='')


# Only run the command-line interface when executed as a script
if __name__ == '__main__':
    main()