author      Nguyễn Gia Phong <mcsinyx@disroot.org>  2023-03-27 22:43:14 +0900
committer   Nguyễn Gia Phong <mcsinyx@disroot.org>  2023-03-27 22:43:14 +0900
commit      e22de4a654fe5c93670fa9ac8098921f577074cd (patch)
tree        97fc5494da7bd1c87c20aca6d3f285bba499d145
parent      abe85863371957151701c2f41739495d02611c6f (diff)
download    rub-e22de4a654fe5c93670fa9ac8098921f577074cd.tar.gz
Begin poking on category-specific feeds and pages
-rw-r--r--  .gitignore           |  1
-rw-r--r--  src/rub/__init__.py  | 53
-rw-r--r--  src/rub/xml.py       | 27
3 files changed, 60 insertions(+), 21 deletions(-)
diff --git a/.gitignore b/.gitignore
index 48698d8..1b39d39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
 __pycache__/
 dist/
 src/*
-!src/rub/
diff --git a/src/rub/__init__.py b/src/rub/__init__.py
index 3be303b..b80b00c 100644
--- a/src/rub/__init__.py
+++ b/src/rub/__init__.py
@@ -17,14 +17,15 @@
 # along with rub.  If not, see <https://www.gnu.org/licenses/>.
 
 from functools import cached_property
+from json import dump as write_json, load as read_json
 from os import walk
 from pathlib import Path
 from shutil import copytree, rmtree
 from typing import Iterator
 
-from doit import run as do
+from doit import create_after, run as do
 
-from rub.xml import Processor, gen_omnifeed
+from rub.xml import Processor, gen_omnifeed, index_categories
 
 __all__ = ['rub']
 
@@ -74,25 +75,43 @@ class Rubber:
     def sources(self) -> list[Path]:
         return glob_files(self.src, '.xml')
 
+    @cached_property
+    def page_tasks(self) -> list[dict]:
+        return [processing_task(self.page_proc, path, self.src, self.out,
+                                f'process {path} into a web page')
+                for path in self.sources]
+
     def task_pages(self) -> Iterator[dict]:
         yield {'name': None, 'doc': 'process sources into web pages'}
-        for path in self.sources:
-            yield processing_task(self.page_proc, path, self.src, self.out,
-                                  f'process {path} into a web page')
+        yield from self.page_tasks
 
-    def task_feeds(self) -> Iterator[dict]:
-        yield {'name': None, 'doc': 'generate web feeds'}
-        feed_src = self.cache / OMNIFEED
+    def task_global_feed(self) -> dict:
         sources = [self.src/path for path in self.sources]
-        pages = [self.page_proc.change_name(self.out/path)
-                 for path in self.sources]
-        yield {'name': 'source', 'doc': 'generate generic global feed',
-               'file_dep': sources+pages,
-               'actions': [(gen_omnifeed,
-                            [sources, pages, self.out, feed_src])],
-               'targets': [feed_src], 'clean': True}
-        yield processing_task(self.feed_proc, OMNIFEED, self.cache, self.out,
-                              'generate global feed')
+        assert all(len(task['targets']) == 1 for task in self.page_tasks)
+        pages = [task['targets'][0] for task in self.page_tasks]
+        src = self.cache / OMNIFEED
+        task = processing_task(self.feed_proc, OMNIFEED, self.cache, self.out,
+                               'generate global feed')
+        file_dep = (file for file in task['file_dep'] if file != src)
+        return {'doc': task['doc'],
+                'file_dep': [*sources, *pages, *file_dep],
+                'actions': [(gen_omnifeed, [sources, pages, self.out, src]),
+                            *task['actions']],
+                'targets': [src, *task['targets']], 'clean': True}
+
+    @create_after(executed='global_feed')
+    def task_categories(self) -> Iterator[dict]:
+        yield {'name': None,
+               'doc': 'generate web page and feed for each category'}
+        omnifeed, index = self.cache / OMNIFEED, self.cache / 'categories.json'
+
+        def write_index():
+            with open(index, 'w') as f:
+                write_json(index_categories(omnifeed), f)
+
+        yield {'name': 'index', 'doc': 'index categories',
+               'file_dep': [omnifeed], 'actions': [write_index],
+               'targets': [index], 'clean': True}
 
 
 def rub(page_proc: Processor, feed_proc: Processor,
diff --git a/src/rub/xml.py b/src/rub/xml.py
index 4c3a2ae..87b5572 100644
--- a/src/rub/xml.py
+++ b/src/rub/xml.py
@@ -16,18 +16,27 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with rub.  If not, see <https://www.gnu.org/licenses/>.
 
+from collections import defaultdict
 from copy import deepcopy
+from functools import cache
 from pathlib import Path
 
 from lxml.builder import E
 from lxml.html import document_fromstring as from_html
-from lxml.etree import QName, XML, XSLT, XSLTExtension, tostring as serialize
+from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension,
+                        tostring as serialize)
 
 __all__ = ['NS', 'Processor', 'recurse']
 
 NS = 'https://rub.parody'
 
 
+def serialize_content(element) -> bytes:
+    text = element.text.encode() if element.text else b''
+    children = b''.join(serialize(deepcopy(i)) for i in element)
+    return text + children
+
+
 def recurse(extension, context, input_node, output_parent):
     """Apply template recursively on input node."""
     output = deepcopy(input_node)
@@ -60,7 +69,7 @@ class Evaluator(XSLTExtension):
 
 class Serializer(XSLTExtension):
     def execute(self, context, self_node, input_node, output_parent):
-        output_parent.text = serialize(deepcopy(input_node))
+        output_parent.text = CDATA(serialize_content(input_node))
 
 
 class Processor:
@@ -88,9 +97,21 @@ def gen_omnifeed(sources: list[Path], pages: list[Path],
         if not desc: continue
         title = src_root.findtext('title', '', {None: NS})
         date = src_root.findtext('date', '', {None: NS})
+        categories = src_root.itertext(tag=QName(NS, 'category').text,
+                                       with_tail=False)
         page_root = from_html(page.read_bytes())
         path = str(page.relative_to(out_dir))
         entries.append(E.entry(E.title(title), E.description(desc),
-                               E.date(date), E.path(path), page_root))
+                               E.date(date), E.path(path),
+                               *map(E.category, categories), page_root))
     dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
     dest.write_bytes(serialize(E.feed(*entries), pretty_print=True))
+
+
+def index_categories(omnifeed: Path) -> dict[str, list[int]]:
+    """Index categories from generic global feed."""
+    index = defaultdict(list)
+    for i, entry in enumerate(XML(omnifeed.read_bytes())):
+        for category in entry.itertext(tag='category', with_tail=False):
+            index[category].append(i)
+    return index
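
Usage note beyond the patch itself: the 'index' subtask above writes categories.json, mapping each category name to the positions of the omnifeed entries carrying it. A minimal sketch of reading that file back, assuming the cache layout used by Rubber and purely hypothetical category names:

    # Read back the index written by task_categories' 'index' subtask.
    # The 'cache' path and category names below are illustrative assumptions.
    from json import load as read_json
    from pathlib import Path

    with open(Path('cache') / 'categories.json') as f:
        index = read_json(f)  # e.g. {'tech': [0, 2], 'poetry': [1]}

    # Each value lists positions of matching <entry> elements in the omnifeed,
    # so a category-specific feed or page can be built from those entries.
    for category, positions in sorted(index.items()):
        print(category, positions)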