diff options
Diffstat (limited to 'src/rub/xml.py')
-rw-r--r-- | src/rub/xml.py | 27 |
1 files changed, 24 insertions, 3 deletions
diff --git a/src/rub/xml.py b/src/rub/xml.py
index 4c3a2ae..87b5572 100644
--- a/src/rub/xml.py
+++ b/src/rub/xml.py
@@ -16,18 +16,27 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with rub. If not, see <https://www.gnu.org/licenses/>.
 
+from collections import defaultdict
 from copy import deepcopy
+from functools import cache
 from pathlib import Path
 
 from lxml.builder import E
 from lxml.html import document_fromstring as from_html
-from lxml.etree import QName, XML, XSLT, XSLTExtension, tostring as serialize
+from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension,
+                        tostring as serialize)
 
 __all__ = ['NS', 'Processor', 'recurse']
 
 NS = 'https://rub.parody'
 
 
+def serialize_content(element) -> str:
+    text = element.text.encode() if element.text else b''
+    children = b''.join(serialize(deepcopy(i)) for i in element)
+    return text + children
+
+
 def recurse(extension, context, input_node, output_parent):
     """Apply template recursively on input node."""
     output = deepcopy(input_node)
@@ -60,7 +69,7 @@ class Evaluator(XSLTExtension):
 
 class Serializer(XSLTExtension):
     def execute(self, context, self_node, input_node, output_parent):
-        output_parent.text = serialize(deepcopy(input_node))
+        output_parent.text = CDATA(serialize_content(input_node))
 
 
 class Processor:
@@ -88,9 +97,21 @@ def gen_omnifeed(sources: list[Path], pages: list[Path],
         if not desc: continue
         title = src_root.findtext('title', '', {None: NS})
         date = src_root.findtext('date', '', {None: NS})
+        categories = src_root.itertext(tag=QName(NS, 'category').text,
+                                       with_tail=False)
         page_root = from_html(page.read_bytes())
         path = str(page.relative_to(out_dir))
         entries.append(E.entry(E.title(title), E.description(desc),
-                               E.date(date), E.path(path), page_root))
+                               E.date(date), E.path(path),
+                               *map(E.category, categories), page_root))
     dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
     dest.write_bytes(serialize(E.feed(*entries), pretty_print=True))
+
+
+def index_categories(pages: Path) -> dict[str, list[int]]:
+    """Index categories from generic global feed."""
+    index = defaultdict(list)
+    for i, entry in enumerate(XML(omnifeed.read_bytes())):
+        for category in entry.itertext(tag='category', with_tail=False):
+            index[category].append(i)
+    return index