about summary refs log tree commit diff
path: root/src/rub/xml.py
blob: 87b5572ce50fd0f1bf47623e14369dc392c708a1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# XML processing abstractions
# Copyright (C) 2023  Nguyễn Gia Phong
#
# This file is part of rub.
#
# Rub is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Rub is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with rub.  If not, see <https://www.gnu.org/licenses/>.

from collections import defaultdict
from copy import deepcopy
from functools import cache
from pathlib import Path

from lxml.builder import E
from lxml.html import document_fromstring as from_html
from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension,
                        tostring as serialize)

__all__ = ['NS', 'Processor', 'recurse']

NS = 'https://rub.parody'


def serialize_content(element) -> bytes:
    """Return the inner markup of *element* serialized as bytes.

    Concatenates the element's leading text (UTF-8 encoded) with the
    serialization of each child element.  The original annotation said
    ``str``, but every code path produces ``bytes`` — the annotation is
    corrected here; behavior is unchanged.

    Children are deep-copied before serialization; presumably this is to
    detach them from the tree so serialization is not affected by their
    position — TODO confirm against lxml tail-handling semantics.
    """
    # element.text is None when the element has no leading text.
    text = element.text.encode() if element.text else b''
    children = b''.join(serialize(deepcopy(i)) for i in element)
    return text + children


def recurse(extension, context, input_node, output_parent):
    """Apply template recursively on input node."""
    # Clone the node to keep its tag/attributes/leading text,
    # then strip the cloned children so they can be rebuilt below.
    result = deepcopy(input_node)
    for i in result: result.remove(i)
    for source_child in input_node:
        for piece in extension.apply_templates(context, source_child):
            if not isinstance(piece, str):
                # Element result: append a copy under the rebuilt node.
                result.append(deepcopy(piece))
                continue
            # String result: attach it where XML text belongs —
            # after the last child (tail), or as the node's own text.
            if len(result) > 0:
                last = result[-1]
                last.tail = piece if last.tail is None else last.tail + piece
            elif result.text is None:
                result.text = piece
            else:
                result.text += piece
    output_parent.append(result)


class Evaluator(XSLTExtension):
    """XSLT extension dispatching input elements to Python handlers.

    Handlers are registered by local name (qualified with NS); elements
    with no registered handler fall back to :func:`recurse`.
    """

    def __init__(self, **handlers):
        super().__init__()
        # Key by the fully-qualified tag text so lookup in execute()
        # can use input_node.tag directly.
        self.handlers = {}
        for name, handler in handlers.items():
            self.handlers[QName(NS, name).text] = handler

    def execute(self, context, self_node, input_node, output_parent):
        handler = self.handlers.get(input_node.tag, recurse)
        handler(self, context, input_node, output_parent)


class Serializer(XSLTExtension):
    """XSLT extension embedding an element's raw content as CDATA."""

    def execute(self, context, self_node, input_node, output_parent):
        content = serialize_content(input_node)
        output_parent.text = CDATA(content)


class Processor:
    """Callable XSLT processor."""

    def __init__(self, xslt: Path, change_name, **handlers) -> None:
        """Compile the stylesheet at *xslt* with rub's eval/serialize
        extensions; *handlers* are forwarded to the Evaluator.
        """
        self.xslt = xslt
        self.change_name = change_name
        extensions = {(NS, 'eval'): Evaluator(**handlers),
                      (NS, 'serialize'): Serializer()}
        stylesheet = XML(xslt.read_bytes())
        self.transform = XSLT(stylesheet, extensions=extensions)

    def process(self, src: Path, dest: Path) -> None:
        """Transform the XML file *src* and write the result to *dest*,
        creating parent directories as needed.
        """
        dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
        result = self.transform(XML(src.read_bytes()))
        dest.write_text(str(result))


def gen_omnifeed(sources: list[Path], pages: list[Path],
                 out_dir: Path, dest: Path) -> None:
    """Generate generic global feed."""
    entries = []
    for source, page in zip(sources, pages):
        root = XML(source.read_bytes())
        description = root.findtext('description', '', {None: NS})
        # Sources without a description are excluded from the feed.
        if not description:
            continue
        title = root.findtext('title', '', {None: NS})
        date = root.findtext('date', '', {None: NS})
        categories = root.itertext(tag=QName(NS, 'category').text,
                                   with_tail=False)
        rendered = from_html(page.read_bytes())
        relative_path = str(page.relative_to(out_dir))
        entry = E.entry(E.title(title), E.description(description),
                        E.date(date), E.path(relative_path),
                        *map(E.category, categories), rendered)
        entries.append(entry)
    dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
    dest.write_bytes(serialize(E.feed(*entries), pretty_print=True))


def index_categories(omnifeed: Path) -> dict[str, list[int]]:
    """Index categories from generic global feed.

    Map each category name to the list of entry indices (positions in
    the feed) carrying that category.

    Bug fix: the body referenced the undefined name ``omnifeed`` while
    the parameter was called ``pages``, so every call raised NameError;
    the parameter is now named ``omnifeed`` to match the intent (it is
    the path to the feed written by gen_omnifeed).
    """
    index = defaultdict(list)
    for position, entry in enumerate(XML(omnifeed.read_bytes())):
        # Entries in the omnifeed use unqualified <category> tags.
        for category in entry.itertext(tag='category', with_tail=False):
            index[category].append(position)
    return index