diff options
Diffstat (limited to 'src/rub/xml.py')
-rw-r--r-- | src/rub/xml.py | 62 |
1 files changed, 43 insertions, 19 deletions
diff --git a/src/rub/xml.py b/src/rub/xml.py index 87b5572..2db5b43 100644 --- a/src/rub/xml.py +++ b/src/rub/xml.py @@ -16,19 +16,30 @@ # You should have received a copy of the GNU Affero General Public License # along with rub. If not, see <https://www.gnu.org/licenses/>. -from collections import defaultdict from copy import deepcopy -from functools import cache from pathlib import Path from lxml.builder import E -from lxml.html import document_fromstring as from_html -from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension, - tostring as serialize) +from lxml.etree import (CDATA, QName, Resolver, XML, XMLParser, + XSLT, XSLTExtension, tostring as serialize) __all__ = ['NS', 'Processor', 'recurse'] NS = 'https://rub.parody' +GEN_OMNIFEED = '''<?xml version="1.0"?> +<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> + <xsl:output method="xml" indent="yes" encoding="UTF-8"/> + <xsl:template match="@*|node()"> + <xsl:copy> + <xsl:apply-templates select="@*|node()"/> + </xsl:copy> + </xsl:template> + <xsl:template match="path"> + <xsl:copy-of select="."/> + <xsl:copy-of select="document(.)"/> + </xsl:template> +</xsl:stylesheet> +''' def serialize_content(element) -> str: @@ -40,7 +51,8 @@ def serialize_content(element) -> str: def recurse(extension, context, input_node, output_parent): """Apply template recursively on input node.""" output = deepcopy(input_node) - for i in output: output.remove(i) + for i in output: + output.remove(i) for i in input_node: for j in extension.apply_templates(context, i): if not isinstance(j, str): @@ -87,31 +99,43 @@ class Processor: dest.write_text(str(self.transform(XML(src.read_bytes())))) -def gen_omnifeed(sources: list[Path], pages: list[Path], +def gen_metadata(sources: list[Path], pages: list[Path], out_dir: Path, dest: Path) -> None: - """Generate generic global feed.""" + """Extract metadata from all source pages.""" entries = [] for src, page in zip(sources, pages): src_root = XML(src.read_bytes()) desc = src_root.findtext('description', '', {None: NS}) - if not desc: continue + if not desc: + continue title = src_root.findtext('title', '', {None: NS}) date = src_root.findtext('date', '', {None: NS}) categories = src_root.itertext(tag=QName(NS, 'category').text, with_tail=False) - page_root = from_html(page.read_bytes()) path = str(page.relative_to(out_dir)) entries.append(E.entry(E.title(title), E.description(desc), - E.date(date), E.path(path), - *map(E.category, categories), page_root)) + E.date(date), *map(E.category, categories), + E.path(path))) dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True) dest.write_bytes(serialize(E.feed(*entries), pretty_print=True)) -def index_categories(pages: Path) -> dict[str, list[int]]: - """Index categories from generic global feed.""" - index = defaultdict(list) - for i, entry in enumerate(XML(omnifeed.read_bytes())): - for category in entry.itertext(tag='category', with_tail=False): - index[category].append(i) - return index +class PageResolver(Resolver): + """URI resolver for use in XSLT document function.""" + + def __init__(self, base: Path) -> None: + self.base = base + super().__init__() + + def resolve(self, path, public_id, context): + return self.resolve_filename(str(self.base/path), context) + + +def gen_omnifeed(metadata: Path, out_dir: Path, dest: Path) -> None: + """Generate generic global feed.""" + parser = XMLParser() + parser.resolvers.add(PageResolver(out_dir)) + transform = XSLT(XML(GEN_OMNIFEED, parser)) + omnifeed = transform(XML(metadata.read_bytes())) + dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True) + dest.write_bytes(serialize(omnifeed, pretty_print=True)) |