1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
# XML processing abstractions
# Copyright (C) 2023 Nguyễn Gia Phong
#
# This file is part of rub.
#
# Rub is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Rub is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with rub. If not, see <https://www.gnu.org/licenses/>.
from collections import defaultdict
from copy import deepcopy
from functools import cache
from pathlib import Path
from lxml.builder import E
from lxml.html import document_fromstring as from_html
from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension,
tostring as serialize)
# Names exported by a star-import of this module.
__all__ = ['NS', 'Processor', 'recurse']
# XML namespace URI used to qualify handler tag names, to register
# the XSLT extension elements and to look up elements in source documents.
NS = 'https://rub.parody'
def serialize_content(element) -> bytes:
    """Return the inner content of *element* as bytes.

    The result is the element's leading text, encoded with the
    ``str.encode()`` default (UTF-8), followed by the serialization
    of each child obtained by iterating over the element.
    The element's own tag and attributes are not part of the result.
    """
    # NOTE(review): the leading text is encoded verbatim, whereas
    # the children go through the serializer -- confirm that no
    # escaping of the text is wanted here.
    text = element.text.encode() if element.text else b''
    # Every child is serialized from a deep copy rather than in place.
    children = b''.join(serialize(deepcopy(child)) for child in element)
    return text + children
def recurse(extension, context, input_node, output_parent):
    """Rebuild the input node from the transformation of its children.

    A copy of *input_node* is emptied of its children, then refilled
    with whatever ``extension.apply_templates`` yields for each child
    of the original node.  The rebuilt copy is finally appended
    to *output_parent*.
    """
    rebuilt = deepcopy(input_node)
    for stale in rebuilt:
        rebuilt.remove(stale)

    for child in input_node:
        for result in extension.apply_templates(context, child):
            if not isinstance(result, str):
                # Non-string results are attached as copies.
                rebuilt.append(deepcopy(result))
                continue
            if len(rebuilt) > 0:
                # Text following an element belongs to that element's tail.
                last = rebuilt[-1]
                last.tail = result if last.tail is None else last.tail + result
                continue
            # No child element yet: the string extends the leading text.
            rebuilt.text = (result if rebuilt.text is None
                            else rebuilt.text + result)
    output_parent.append(rebuilt)
class Evaluator(XSLTExtension):
    """XSLT extension element dispatching input nodes to handlers.

    Each keyword argument maps the local name of an element
    in the NS namespace to the callable handling that element.
    """

    def __init__(self, **handlers):
        # Key the handlers by fully qualified tag name, so that they
        # can be looked up directly by a node's tag.
        qualified = {}
        for local_name, handler in handlers.items():
            qualified[QName(NS, local_name).text] = handler
        self.handlers = qualified
        super().__init__()

    def execute(self, context, self_node, input_node, output_parent):
        """Run the handler registered for the input node's tag.

        Nodes without a registered handler are processed by recurse.
        """
        tag = input_node.tag
        handler = self.handlers[tag] if tag in self.handlers else recurse
        handler(self, context, input_node, output_parent)
class Serializer(XSLTExtension):
    """XSLT extension element emitting the input node's content as CDATA."""

    def execute(self, context, self_node, input_node, output_parent):
        """Set the output parent's text to the serialized input content."""
        markup = serialize_content(input_node)
        output_parent.text = CDATA(markup)
class Processor:
    """XSLT processor bound to a single stylesheet."""

    def __init__(self, xslt: Path, change_name, **handlers) -> None:
        """Compile the stylesheet at *xslt* with rub's extension elements.

        The keyword arguments are forwarded to the Evaluator registered
        as the ``eval`` extension element; *change_name* is only stored.
        """
        self.xslt = xslt
        self.change_name = change_name
        stylesheet = XML(xslt.read_bytes())
        extensions = {
            (NS, 'eval'): Evaluator(**handlers),
            (NS, 'serialize'): Serializer(),
        }
        self.transform = XSLT(stylesheet, extensions=extensions)

    def process(self, src: Path, dest: Path) -> None:
        """Transform the document at *src* and write the result to *dest*.

        Missing parent directories of *dest* are created beforehand.
        """
        dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
        document = XML(src.read_bytes())
        result = self.transform(document)
        dest.write_text(str(result))
def gen_omnifeed(sources: list[Path], pages: list[Path],
                 out_dir: Path, dest: Path) -> None:
    """Write a feed gathering every described source and its page.

    Sources and pages are paired by position.  A source without
    a non-empty description is left out.  Each remaining pair becomes
    one entry holding the source's title, description, date and
    categories, the page's path relative to *out_dir*,
    and the parsed page itself.  The feed is written to *dest*,
    whose missing parent directories are created.
    """
    namespaces = {None: NS}
    category_tag = QName(NS, 'category').text

    entries = []
    for src, page in zip(sources, pages):
        metadata = XML(src.read_bytes())
        description = metadata.findtext('description', '', namespaces)
        if not description:
            continue
        title = metadata.findtext('title', '', namespaces)
        date = metadata.findtext('date', '', namespaces)
        category_texts = metadata.itertext(tag=category_tag,
                                           with_tail=False)
        rendered = from_html(page.read_bytes())
        relative = str(page.relative_to(out_dir))
        categories = [E.category(text) for text in category_texts]
        entry = E.entry(E.title(title), E.description(description),
                        E.date(date), E.path(relative),
                        *categories, rendered)
        entries.append(entry)

    dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
    feed = E.feed(*entries)
    dest.write_bytes(serialize(feed, pretty_print=True))
def index_categories(pages: Path) -> dict[str, list[int]]:
    """Index categories from generic global feed.

    *pages* is the path to the feed file to read, presumably the one
    written by gen_omnifeed.  The returned mapping associates each
    category text with the positions, in document order,
    of the feed entries carrying that category.
    """
    index = defaultdict(list)
    # The feed must be read from the path given as argument:
    # no other feed path is defined in this function.
    for i, entry in enumerate(XML(pages.read_bytes())):
        for category in entry.itertext(tag='category', with_tail=False):
            index[category].append(i)
    return index
|