From 1fec9d98a084106866d842d10f03dde9ff5472ee Mon Sep 17 00:00:00 2001
From: Nguyễn Gia Phong
Date: Thu, 30 Mar 2023 03:06:23 +0900
Subject: Cache metadata

---
 src/rub/__init__.py | 44 ++++++++++++++++++-------------------
 src/rub/xml.py      | 62 +++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 65 insertions(+), 41 deletions(-)

diff --git a/src/rub/__init__.py b/src/rub/__init__.py
index b80b00c..a569dae 100644
--- a/src/rub/__init__.py
+++ b/src/rub/__init__.py
@@ -17,15 +17,14 @@
 # along with rub. If not, see <https://www.gnu.org/licenses/>.
 
 from functools import cached_property
-from json import dump as write_json, load as read_json
 from os import walk
 from pathlib import Path
 from shutil import copytree, rmtree
 from typing import Iterator
 
-from doit import create_after, run as do
+from doit import run as do
 
-from rub.xml import Processor, gen_omnifeed, index_categories
+from rub.xml import Processor, gen_metadata, gen_omnifeed
 
 __all__ = ['rub']
 
@@ -75,44 +74,45 @@ class Rubber:
     def sources(self) -> list[Path]:
         return glob_files(self.src, '.xml')
 
+    @cached_property
+    def metadata(self) -> Path:
+        return self.cache / 'metadata.xml'
+
     @cached_property
     def page_tasks(self) -> list[dict]:
         return [processing_task(self.page_proc, path, self.src, self.out,
                                 f'process {path} into a web page')
                 for path in self.sources]
 
+    @cached_property
+    def pages(self) -> list[Path]:
+        for task in self.page_tasks:
+            assert len(task['targets']) == 1
+        return [task['targets'][0] for task in self.page_tasks]
+
+    def task_metadata(self) -> dict:
+        sources = [self.src/path for path in self.sources]
+        return {'doc': 'extract metadata from source pages',
+                'file_dep': sources,
+                'actions': [(gen_metadata, [sources, self.pages, self.out,
+                                            self.metadata])],
+                'targets': [self.metadata], 'clean': True}
+
     def task_pages(self) -> Iterator[dict]:
         yield {'name': None, 'doc': 'process sources into web pages'}
         yield from self.page_tasks
 
     def task_global_feed(self) -> dict:
-        sources = [self.src/path for path in self.sources]
-        for task in self.page_tasks: assert len(task['targets']) == 1
-        pages = [task['targets'][0] for task in self.page_tasks]
         src = self.cache / OMNIFEED
         task = processing_task(self.feed_proc, OMNIFEED, self.cache, self.out,
                                'generate global feed')
         file_dep = (file for file in task['file_dep'] if file != src)
         return {'doc': task['doc'],
-                'file_dep': [*sources, *pages, *file_dep],
-                'actions': [(gen_omnifeed, [sources, pages, self.out, src]),
+                'file_dep': [self.metadata, *self.pages, *file_dep],
+                'actions': [(gen_omnifeed, [self.metadata, self.out, src]),
                             *task['actions']],
                 'targets': [src, *task['targets']], 'clean': True}
 
-    @create_after(executed='global_feed')
-    def task_categories(self) -> Iterator[dict]:
-        yield {'name': None,
-               'doc': 'generate web page and feed for each category'}
-        omnifeed, index = self.cache / OMNIFEED, self.cache / 'categories.json'
-
-        def write_index():
-            with open(index, 'w') as f:
-                write_json(index_categories(omnifeed), f)
-
-        yield {'name': 'index', 'doc': 'index categories',
-               'file_dep': [omnifeed], 'actions': [write_index],
-               'targets': [index], 'clean': True}
-
 
 def rub(page_proc: Processor, feed_proc: Processor,
         base: Path, src: Path, cache: Path, out: Path) -> None:
diff --git a/src/rub/xml.py b/src/rub/xml.py
index 87b5572..2db5b43 100644
--- a/src/rub/xml.py
+++ b/src/rub/xml.py
@@ -16,19 +16,30 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with rub. If not, see <https://www.gnu.org/licenses/>.
 
-from collections import defaultdict
 from copy import deepcopy
-from functools import cache
 from pathlib import Path
 
 from lxml.builder import E
-from lxml.html import document_fromstring as from_html
-from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension,
-                        tostring as serialize)
+from lxml.etree import (CDATA, QName, Resolver, XML, XMLParser,
+                        XSLT, XSLTExtension, tostring as serialize)
 
 __all__ = ['NS', 'Processor', 'recurse']
 
 NS = 'https://rub.parody'
+GEN_OMNIFEED = '''<xsl:stylesheet version="1.0"
+                  xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:template match="/">
+    <feed>
+      <xsl:for-each select="feed/entry">
+        <entry>
+          <xsl:copy-of select="title|description|date|category|path"/>
+          <xsl:copy-of select="document(path)"/>
+        </entry>
+      </xsl:for-each>
+    </feed>
+  </xsl:template>
+</xsl:stylesheet>
+'''
 
 
 def serialize_content(element) -> str:
@@ -40,7 +51,8 @@ def serialize_content(element) -> str:
 
 def recurse(extension, context, input_node, output_parent):
     """Apply template recursively on input node."""
     output = deepcopy(input_node)
-    for i in output: output.remove(i)
+    for i in output:
+        output.remove(i)
     for i in input_node:
         for j in extension.apply_templates(context, i):
             if not isinstance(j, str):
@@ -87,31 +99,43 @@ class Processor:
         dest.write_text(str(self.transform(XML(src.read_bytes()))))
 
 
-def gen_omnifeed(sources: list[Path], pages: list[Path],
+def gen_metadata(sources: list[Path], pages: list[Path],
                  out_dir: Path, dest: Path) -> None:
-    """Generate generic global feed."""
+    """Extract metadata from all source pages."""
     entries = []
     for src, page in zip(sources, pages):
         src_root = XML(src.read_bytes())
         desc = src_root.findtext('description', '', {None: NS})
-        if not desc: continue
+        if not desc:
+            continue
         title = src_root.findtext('title', '', {None: NS})
         date = src_root.findtext('date', '', {None: NS})
         categories = src_root.itertext(tag=QName(NS, 'category').text,
                                        with_tail=False)
-        page_root = from_html(page.read_bytes())
         path = str(page.relative_to(out_dir))
         entries.append(E.entry(E.title(title), E.description(desc),
-                               E.date(date), E.path(path),
-                               *map(E.category, categories), page_root))
+                               E.date(date), *map(E.category, categories),
+                               E.path(path)))
     dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
     dest.write_bytes(serialize(E.feed(*entries), pretty_print=True))
 
 
-def index_categories(pages: Path) -> dict[str, list[int]]:
-    """Index categories from generic global feed."""
-    index = defaultdict(list)
-    for i, entry in enumerate(XML(omnifeed.read_bytes())):
-        for category in entry.itertext(tag='category', with_tail=False):
-            index[category].append(i)
-    return index
+class PageResolver(Resolver):
+    """URI resolver for use in XSLT document function."""
+
+    def __init__(self, base: Path) -> None:
+        self.base = base
+        super().__init__()
+
+    def resolve(self, path, public_id, context):
+        return self.resolve_filename(str(self.base/path), context)
+
+
+def gen_omnifeed(metadata: Path, out_dir: Path, dest: Path) -> None:
+    """Generate generic global feed."""
+    parser = XMLParser()
+    parser.resolvers.add(PageResolver(out_dir))
+    transform = XSLT(XML(GEN_OMNIFEED, parser))
+    omnifeed = transform(XML(metadata.read_bytes()))
+    dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
+    dest.write_bytes(serialize(omnifeed, pretty_print=True))
-- 
cgit 1.4.1
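
The new gen_omnifeed leans on lxml's resolver machinery so that XSLT document()
calls made by the stylesheet are resolved against the output directory of built
pages. Below is a minimal, self-contained sketch of that technique outside of
rub, assuming only lxml; the toy stylesheet, file names, and element names are
invented for illustration and are not part of the patch.

from pathlib import Path
from tempfile import TemporaryDirectory

from lxml.etree import XML, XMLParser, XSLT, Resolver, tostring

# Toy stylesheet: for every <entry>, embed the document named by its <path>.
STYLESHEET = '''<xsl:stylesheet version="1.0"
                                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/feed">
    <feed>
      <xsl:for-each select="entry">
        <entry>
          <xsl:copy-of select="title"/>
          <xsl:copy-of select="document(path)"/>
        </entry>
      </xsl:for-each>
    </feed>
  </xsl:template>
</xsl:stylesheet>'''


class DirResolver(Resolver):
    """Resolve URIs requested by document() against a base directory."""

    def __init__(self, base: Path) -> None:
        self.base = base
        super().__init__()

    def resolve(self, url, public_id, context):
        return self.resolve_filename(str(self.base/url), context)


with TemporaryDirectory() as tmp:
    out_dir = Path(tmp)  # stand-in for the directory of built pages
    (out_dir/'hello.xml').write_text('<article>Hello, world!</article>')
    metadata = XML('<feed><entry><title>Hello</title>'
                   '<path>hello.xml</path></entry></feed>')
    parser = XMLParser()
    parser.resolvers.add(DirResolver(out_dir))  # same pattern as PageResolver
    transform = XSLT(XML(STYLESHEET, parser))
    print(tostring(transform(metadata), pretty_print=True).decode())

Running this prints a feed whose single entry carries both the copied title and
the embedded article, which is essentially what the cached metadata.xml plus
document() resolution provides to the global feed stage.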