about summary refs log tree commit diff
path: root/src/rub/xml.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/rub/xml.py')
-rw-r--r-- src/rub/xml.py 27
1 files changed, 24 insertions, 3 deletions
diff --git a/src/rub/xml.py b/src/rub/xml.py
index 4c3a2ae..87b5572 100644
--- a/src/rub/xml.py
+++ b/src/rub/xml.py
@@ -16,18 +16,27 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with rub.  If not, see <https://www.gnu.org/licenses/>.
 
+from collections import defaultdict
 from copy import deepcopy
+from functools import cache
 from pathlib import Path
 
 from lxml.builder import E
 from lxml.html import document_fromstring as from_html
-from lxml.etree import QName, XML, XSLT, XSLTExtension, tostring as serialize
+from lxml.etree import (CDATA, QName, XML, XSLT, XSLTExtension,
+                        tostring as serialize)
 
 __all__ = ['NS', 'Processor', 'recurse']
 
 NS = 'https://rub.parody'
 
 
+def serialize_content(element) -> bytes:
+    """Return the serialized content of element, without its own tag.
+
+    The result is the element's leading text (encoded to bytes)
+    followed by the serialization of each of its child elements.
+    An element with neither text nor children yields b''.
+    """
+    text = element.text.encode() if element.text else b''
+    children = b''.join(serialize(deepcopy(i)) for i in element)
+    return text + children
+
+
 def recurse(extension, context, input_node, output_parent):
     """Apply template recursively on input node."""
     output = deepcopy(input_node)
@@ -60,7 +69,7 @@ class Evaluator(XSLTExtension):
 
 class Serializer(XSLTExtension):
+    """XSLT extension element storing the input node's content as CDATA."""
     def execute(self, context, self_node, input_node, output_parent):
-        output_parent.text = serialize(deepcopy(input_node))
+        # Only the node's text and children are serialized
+        # (see serialize_content), then wrapped in CDATA
+        # and set as the text of the output parent.
+        output_parent.text = CDATA(serialize_content(input_node))
 
 
 class Processor:
@@ -88,9 +97,21 @@ def gen_omnifeed(sources: list[Path], pages: list[Path],
         if not desc: continue
         title = src_root.findtext('title', '', {None: NS})
         date = src_root.findtext('date', '', {None: NS})
+        categories = src_root.itertext(tag=QName(NS, 'category').text,
+                                       with_tail=False)
         page_root = from_html(page.read_bytes())
         path = str(page.relative_to(out_dir))
         entries.append(E.entry(E.title(title), E.description(desc),
-                               E.date(date), E.path(path), page_root))
+                               E.date(date), E.path(path),
+                               *map(E.category, categories), page_root))
     dest.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
     dest.write_bytes(serialize(E.feed(*entries), pretty_print=True))
+
+
+def index_categories(pages: Path) -> dict[str, list[int]]:
+    """Index categories from generic global feed.
+
+    Parse the XML file at pages and map each category text
+    to the positions, among the root's children, of the entries
+    that carry it.  Positions are appended in document order.
+    """
+    index = defaultdict(list)
+    # Read the feed from the path argument: it is the only input
+    # this function receives.
+    for i, entry in enumerate(XML(pages.read_bytes())):
+        for category in entry.itertext(tag='category', with_tail=False):
+            index[category].append(i)
+    return index