summary refs log tree commit diff
path: root/src/formbox.py
blob: 39e61320292cba3e8fb6a9d942345660fc719f3b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# Format mbox as HTML/XML
# Copyright (C) 2021  Nguyễn Gia Phong
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from argparse import ArgumentParser
from email.header import decode_header
from email.utils import parseaddr, parsedate_to_datetime
from functools import partial
from html import escape
from itertools import starmap
from mailbox import mbox
from pathlib import Path
from urllib.parse import quote

from bleach import clean, linkify
from markdown import markdown

# Elements that bleach.clean is told to accept in a rendered body.
_ALLOWED_TAGS = ('a', 'code', 'em', 'strong', 'sub', 'sup',
                 'blockquote', 'p', 'pre', 'ul', 'ol', 'li')
# URL schemes that bleach.clean is told to accept.
_ALLOWED_PROTOCOLS = ('ftp', 'gemini', 'gopher', 'http', 'https',
                      'irc', 'ircs', 'mailto', 'matrix', 'xmpp')

# bleach.clean with the whitelists above filled in.
sanitise = partial(clean, tags=_ALLOWED_TAGS, protocols=_ALLOWED_PROTOCOLS)


def extract(archive, parent):
    """Generate the replies to parent together with their own replies.

    archive maps message IDs to messages.  Each message whose
    In-Reply-To equals parent is removed from archive and yielded
    as a pair (message, replies), where replies is another generator
    of this very kind for the replies to that message.
    """
    # Walk a snapshot, since matches are deleted from archive itself,
    # both here and by the nested generators.
    for key, mail in tuple(archive.items()):
        # TODO: handle multipart
        if mail['In-Reply-To'] == parent:
            del archive[key]
            yield mail, extract(archive, key)


def decode(header):
    """Yield the chunks of the given RFC 2047 header as text.

    Each chunk returned by email.header.decode_header is decoded
    using its declared charset, or UTF-8 if none is declared.
    """
    for string, charset in decode_header(header):
        # decode_header hands back str instead of bytes
        # for a header without any encoded word.
        if isinstance(string, bytes):
            yield string.decode('utf-8' if charset is None else charset)
        else:
            yield string


def render(template, forest, parent):
    """Yield template filled in for each message of forest, recursively.

    template is a str.format template which may refer to the fields
    message_id, date, author, body and children.  forest is an iterable
    of pairs (message, replies), where replies is again such an iterable.
    parent is not used by the rendering itself and is only kept
    for the callers passing it.

    The body is converted from Markdown and sanitised, the author
    is HTML-escaped, the message ID is URL-quoted and the children
    are the rendered replies joined by newlines.
    """
    for message, children in forest:
        message_id = message['Message-Id']
        date = parsedate_to_datetime(message['Date']).date().isoformat()
        # Split the address off before decoding, so that the display name
        # may consist of any number of chunks, encoded or not.
        name, address = parseaddr(message['From'] or '')
        # Fall back to the bare address if no display name is given.
        author = ''.join(decode(name)) if name else address
        # NOTE(review): get_payload is called without decode=True,
        # so a body with a Content-Transfer-Encoding presumably
        # reaches markdown still encoded -- confirm.
        body = sanitise(linkify(markdown(message.get_payload(),
                                         output_format='html5')))
        rendered_children = render(template, children, message_id)
        # The From header is untrusted, so it must not reach
        # the markup verbatim.
        yield template.format(message_id=quote(message_id),
                              date=date, author=escape(author), body=body,
                              children='\n'.join(rendered_children))


# Command line: the mbox file, the ID whose replies are rendered
# and the file holding the template.
parser = ArgumentParser()
parser.add_argument('mbox')
parser.add_argument('id')
parser.add_argument('template', type=Path)
args = parser.parse_args()

# Index the messages by Message-Id for extract to pick from.
archive = dict((message['Message-Id'], message)
               for message in mbox(args.mbox))
template = args.template.read_text()
# Write the rendered replies back to back, without a trailing newline.
print(''.join(render(template, extract(archive, args.id), args.id)), end='')