Comply better with RFC 4287

author: Nguyễn Gia Phong <cnx@loang.net> 2023-08-18 12:39:30 +0900
committer: Nguyễn Gia Phong <cnx@loang.net> 2024-01-17 17:24:14 +0900
commit: 355050b447929113577301a97015a6142a86f4a8 (patch)
tree: 456a4dbdbe0a603650f56754db06ee520471073a /src
parent: 453cc4a0751bb5f29d4a8c7b841c345653a89f87 (diff)
download: fead-355050b447929113577301a97015a6142a86f4a8.tar.gz
1 files changed, 20 insertions, 13 deletions
diff --git a/src/fead.py b/src/fead.py
index e9abd90..a900fce 100755
--- a/src/fead.py
+++ b/src/fead.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Advert generator from web feeds
-# Copyright (C) 2022, 2024  Nguyễn Gia Phong
+# Copyright (C) 2022-2024  Nguyễn Gia Phong
 # Copyright (C) 2023  Ngô Ngọc Đức Huy
 #
 # This program is free software: you can redistribute it and/or modify
@@ -40,6 +40,7 @@ from xml.etree.ElementTree import (fromstring as parse_xml,
 
 REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'
 HTML_TAG = regex('<.+?>')
+XHTML_NS = '{http://www.w3.org/1999/xhtml}'
 
 Advert = namedtuple('Advert', ('source_title', 'source_link',
                                'title', 'link', 'time', 'summary'))
@@ -109,27 +110,33 @@ def parse_rss(xml, title):
     return title, link, items
 
 
+def unparse_atom_text(xml):
+    """Extract Atom Text construct, unparsing the div of XHTML ones."""
+    if xml.attrib.get('type') != 'xhtml': return xml.text
+    assert len(xml) == 1 and xml[0].tag.endswith(f'{XHTML_NS}div')
+    for elem in xml[0].iter():
+        # Strip only the XHTML namespace; slicing other tags corrupts them.
+        if elem.tag.startswith(XHTML_NS): elem.tag = elem.tag[len(XHTML_NS):]
+    return unparse_xml(xml[0]).decode()
+
+
 def parse_atom_entry(xml):
     """Parse given Atom entry."""
-    time = datetime.fromtimestamp(0)
+    time = None  # RFC 4287 requires atom:updated
     summary = ''
     for child in xml:
         if child.tag.endswith('Atom}title'):
-            title = child.text
+            title = unparse_atom_text(child)
         elif child.tag.endswith('Atom}link'):
             rel = child.attrib.get('rel')
             if rel == 'alternate' or not rel: link = child.attrib['href']
-        elif child.tag.endswith('Atom}published'):
+        elif (child.tag.endswith('Atom}published')
+              or child.tag.endswith('Atom}updated') and time is None):
             iso = child.text.replace('Z', '+00:00')  # normalized
             time = datetime.fromisoformat(iso)
-        elif child.tag.endswith('Atom}summary'):
-            summary = child.text
-        elif child.tag.endswith('Atom}content') and not summary:
-            if child.attrib.get('type') == 'xhtml':
-                assert len(child) == 1 and child[0].tag.endswith('xhtml}div')
-                summary = unparse_xml(child[0]).decode()
-            else:
-                summary = child.text
+        elif (child.tag.endswith('Atom}summary')
+              or child.tag.endswith('Atom}content') and not summary):
+            summary = unparse_atom_text(child)
     return title, link, time, summary
 
 
@@ -138,7 +145,7 @@ def parse_atom(xml, title, link):
     entries = []
     for child in xml:
         if child.tag.endswith('Atom}title'):
-            title = child.text
+            title = unparse_atom_text(child)
         elif child.tag.endswith('Atom}link'):
             rel = child.attrib.get('rel')
             if rel == 'alternate' or not rel: link = child.attrib['href']
author	Nguyễn Gia Phong <cnx@loang.net>	2023-08-18 12:39:30 +0900
committer	Nguyễn Gia Phong <cnx@loang.net>	2024-01-17 17:24:14 +0900
commit	355050b447929113577301a97015a6142a86f4a8 (patch)
tree	456a4dbdbe0a603650f56754db06ee520471073a /src
parent	453cc4a0751bb5f29d4a8c7b841c345653a89f87 (diff)
download	fead-355050b447929113577301a97015a6142a86f4a8.tar.gz