about summary refs log tree commit diff homepage
path: root/_libs/formbox/format
diff options
context:
space:
mode:
Diffstat (limited to '_libs/formbox/format')
-rwxr-xr-x _libs/formbox/format 71
1 files changed, 71 insertions, 0 deletions
diff --git a/_libs/formbox/format b/_libs/formbox/format
new file mode 100755
index 0000000..39e6132
--- /dev/null
+++ b/_libs/formbox/format
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# Format mbox as HTML/XML
+# Copyright (C) 2021  Nguyễn Gia Phong
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from argparse import ArgumentParser
+from email.header import decode_header
+from email.utils import parseaddr, parsedate_to_datetime
+from functools import partial
+from html import escape
+from itertools import starmap
+from mailbox import mbox
+from pathlib import Path
+from urllib.parse import quote
+
+from bleach import clean, linkify
+from markdown import markdown
+
+# Tags passed to bleach's clean() as the allowed set.
+_ALLOWED_TAGS = (
+    'a', 'code', 'em', 'strong', 'sub', 'sup',
+    'blockquote', 'p', 'pre', 'ul', 'ol', 'li',
+)
+# URL schemes passed to bleach's clean() as the allowed set.
+_ALLOWED_PROTOCOLS = (
+    'ftp', 'gemini', 'gopher', 'http', 'https',
+    'irc', 'ircs', 'mailto', 'matrix', 'xmpp',
+)
+# clean() with both allow-lists bound in advance.
+sanitise = partial(clean, tags=_ALLOWED_TAGS, protocols=_ALLOWED_PROTOCOLS)
+
+
+def extract(archive, parent):
+    """Lazily yield the replies to parent, each paired with its own replies.
+
+    Every item is a pair of a message whose In-Reply-To header equals
+    parent and a generator of the same kind of pairs for that message's
+    key in archive.  A message is removed from archive right before
+    it is yielded.
+    """
+    # Walk a snapshot, since archive shrinks here and in nested calls.
+    for key, reply in tuple(archive.items()):
+        # TODO: handle multipart
+        if reply['In-Reply-To'] == parent:
+            del archive[key]
+            yield reply, extract(archive, key)
+
+
+def decode(header):
+    """Yield the chunks of a possibly RFC 2047 encoded header as text.
+
+    Chunks come in the order decode_header gives them.  Byte chunks
+    are decoded with their declared charset, or UTF-8 if none is given.
+    """
+    for chunk, charset in decode_header(header):
+        if isinstance(chunk, bytes):
+            yield chunk.decode('utf-8' if charset is None else charset)
+        else:
+            # A header without any encoded word comes back as the
+            # original str, which has nothing left to decode.
+            yield chunk
+
+
+def render(template, forest, parent):
+    """Yield template filled in for every message of forest.
+
+    forest is an iterable of (message, children) pairs, where children
+    is again such an iterable; replies are rendered recursively.
+    The template is formatted with the fields message_id (percent-encoded),
+    date (ISO 8601 calendar date), author (HTML-escaped), body (Markdown
+    rendered to HTML, linkified and sanitised) and children (the rendered
+    replies joined by newlines).  parent is accepted but not used.
+    """
+    for message, children in forest:
+        message_id = message['Message-Id']
+        date = parsedate_to_datetime(message['Date']).date().isoformat()
+        # Split off the address before decoding, instead of relying on
+        # the number of chunks the header happens to decode to.
+        name, address = parseaddr(message['From'])
+        # From is sender-controlled, so escape it before it lands in markup.
+        author = escape(''.join(decode(name)) if name else address)
+        body = sanitise(linkify(markdown(message.get_payload(),
+                                         output_format='html5')))
+        rendered_children = render(template, children, message_id)
+        yield template.format(message_id=quote(message_id),
+                              date=date, author=author, body=body,
+                              children='\n'.join(rendered_children))
+
+
+# Command line: the mbox file to read, the ID that top-level messages
+# reply to, and the file holding the str.format template for a message.
+parser = ArgumentParser()
+parser.add_argument('mbox')
+parser.add_argument('id')
+parser.add_argument('template', type=Path)
+args = parser.parse_args()
+
+# Index every message by its Message-Id so threads can be assembled.
+archive = dict((m['Message-Id'], m) for m in mbox(args.mbox))
+template = args.template.read_text()
+# Write the rendered top-level messages back to back, no trailing newline.
+forest = extract(archive, args.id)
+print(''.join(render(template, forest, args.id)), end='')