From e22de4a654fe5c93670fa9ac8098921f577074cd Mon Sep 17 00:00:00 2001 From: Nguyễn Gia Phong Date: Mon, 27 Mar 2023 22:43:14 +0900 Subject: Begin poking on category-specific feeds and pages --- .gitignore | 1 - src/rub/__init__.py | 53 ++++++++++++++++++++++++++++++++++++----------------- src/rub/xml.py | 27 ++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 48698d8..1b39d39 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ __pycache__/ dist/ src/* -!src/rub/ diff --git a/src/rub/__init__.py b/src/rub/__init__.py index 3be303b..b80b00c 100644 --- a/src/rub/__init__.py +++ b/src/rub/__init__.py @@ -17,14 +17,15 @@ # along with rub. If not, see . from functools import cached_property +from json import dump as write_json, load as read_json from os import walk from pathlib import Path from shutil import copytree, rmtree from typing import Iterator -from doit import run as do +from doit import create_after, run as do -from rub.xml import Processor, gen_omnifeed +from rub.xml import Processor, gen_omnifeed, index_categories __all__ = ['rub'] @@ -74,25 +75,43 @@ class Rubber: def sources(self) -> list[Path]: return glob_files(self.src, '.xml') + @cached_property + def page_tasks(self) -> list[dict]: + return [processing_task(self.page_proc, path, self.src, self.out, + f'process {path} into a web page') + for path in self.sources] + def task_pages(self) -> Iterator[dict]: yield {'name': None, 'doc': 'process sources into web pages'} - for path in self.sources: - yield processing_task(self.page_proc, path, self.src, self.out, - f'process {path} into a web page') + yield from self.page_tasks - def task_feeds(self) -> Iterator[dict]: - yield {'name': None, 'doc': 'generate web feeds'} - feed_src = self.cache / OMNIFEED + def task_global_feed(self) -> dict: sources = [self.src/path for path in self.sources] - pages = [self.page_proc.change_name(self.out/path) - for path in self.sources] - yield {'name': 'source', 'doc': 'generate generic global feed', - 'file_dep': sources+pages, - 'actions': [(gen_omnifeed, - [sources, pages, self.out, feed_src])], - 'targets': [feed_src], 'clean': True} - yield processing_task(self.feed_proc, OMNIFEED, self.cache, self.out, - 'generate global feed') + for task in self.page_tasks: assert len(task['targets']) == 1 + pages = [task['targets'][0] for task in self.page_tasks] + src = self.cache / OMNIFEED + task = processing_task(self.feed_proc, OMNIFEED, self.cache, self.out, + 'generate global feed') + file_dep = (file for file in task['file_dep'] if file != src) + return {'doc': task['doc'], + 'file_dep': [*sources, *pages, *file_dep], + 'actions': [(gen_omnifeed, [sources, pages, self.out, src]), + *task['actions']], + 'targets': [src, *task['targets']], 'clean': True} + + @create_after(executed='global_feed') + def task_categories(self) -> Iterator[dict]: + yield {'name': None, + 'doc': 'generate web page and feed for each category'} + omnifeed, index = self.cache / OMNIFEED, self.cache / 'categories.json' + + def write_index(): + with open(index, 'w') as f: + write_json(index_categories(omnifeed), f) + + yield {'name': 'index', 'doc': 'index categories', + 'file_dep': [omnifeed], 'actions': [write_index], + 'targets': [index], 'clean': True} def rub(page_proc: Processor, feed_proc: Processor, diff --git a/src/rub/xml.py b/src/rub/xml.py index 4c3a2ae..87b5572 100644 --- a/src/rub/xml.py +++ b/src/rub/xml.py @@ -16,18 +16,27 @@ # You should have received a copy of the GNU Affero General Public License # along with rub. If not, see . +from collections import defaultdict from copy import deepcopy +from functools import cache from pathlib import Path from lxml.builder import E from lxml.html import document_fromstring as from_html -from lxml.etree import QName, XML, XSLT, XSLTExtension, tostring as serialize +from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension, + tostring as serialize) __all__ = ['NS', 'Processor', 'recurse'] NS = 'https://rub.parody' +def serialize_content(element) -> str: + text = element.text.encode() if element.text else b'' + children = b''.join(serialize(deepcopy(i)) for i in element) + return text + children + + def recurse(extension, context, input_node, output_parent): """Apply template recursively on input node.""" output = deepcopy(input_node) @@ -60,7 +69,7 @@ class Evaluator(XSLTExtension): class Serializer(XSLTExtension): def execute(self, context, self_node, input_node, output_parent): - output_parent.text = serialize(deepcopy(input_node)) + output_parent.text = CDATA(serialize_content(input_node)) class Processor: @@ -88,9 +97,21 @@ def gen_omnifeed(sources: list[Path], pages: list[Path], if not desc: continue title = src_root.findtext('title', '', {None: NS}) date = src_root.findtext('date', '', {None: NS}) + categories = src_root.itertext(tag=QName(NS, 'category').text, + with_tail=False) page_root = from_html(page.read_bytes()) path = str(page.relative_to(out_dir)) entries.append(E.entry(E.title(title), E.description(desc), - E.date(date), E.path(path), page_root)) + E.date(date), E.path(path), + *map(E.category, categories), page_root)) dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True) dest.write_bytes(serialize(E.feed(*entries), pretty_print=True)) + + +def index_categories(pages: Path) -> dict[str, list[int]]: + """Index categories from generic global feed.""" + index = defaultdict(list) + for i, entry in enumerate(XML(omnifeed.read_bytes())): + for category in entry.itertext(tag='category', with_tail=False): + index[category].append(i) + return index -- cgit 1.4.1