about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Nguyễn Gia Phong <cnx@loang.net> 2023-08-18 12:39:30 +0900
committer: Nguyễn Gia Phong <cnx@loang.net> 2024-01-17 17:24:14 +0900
commit: 355050b447929113577301a97015a6142a86f4a8 (patch)
tree: 456a4dbdbe0a603650f56754db06ee520471073a /src
parent: 453cc4a0751bb5f29d4a8c7b841c345653a89f87 (diff)
download: fead-355050b447929113577301a97015a6142a86f4a8.tar.gz
Comply better with RFC 4287
Diffstat (limited to 'src')
-rwxr-xr-x src/fead.py 33
1 files changed, 20 insertions, 13 deletions
diff --git a/src/fead.py b/src/fead.py
index e9abd90..a900fce 100755
--- a/src/fead.py
+++ b/src/fead.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
# Advert generator from web feeds
-# Copyright (C) 2022, 2024 Nguyễn Gia Phong
+# Copyright (C) 2022-2024 Nguyễn Gia Phong
# Copyright (C) 2023 Ngô Ngọc Đức Huy
#
# This program is free software: you can redistribute it and/or modify
@@ -40,6 +40,7 @@ from xml.etree.ElementTree import (fromstring as parse_xml,
REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'
HTML_TAG = regex('<.+?>')
+XHTML_NS = '{http://www.w3.org/1999/xhtml}'
Advert = namedtuple('Advert', ('source_title', 'source_link',
'title', 'link', 'time', 'summary'))
@@ -109,27 +110,33 @@ def parse_rss(xml, title):
return title, link, items
+def unparse_atom_text(xml):
+ """Extract Atom Text construct."""
+ if xml.attrib.get('type') == 'xhtml':
+ assert len(xml) == 1 and xml[0].tag.endswith(f'{XHTML_NS}div')
+ for elem in xml[0].iter():
+ elem.tag = elem.tag[len(XHTML_NS):]
+ return unparse_xml(xml[0]).decode()
+ return xml.text
+
+
def parse_atom_entry(xml):
"""Parse given Atom entry."""
- time = datetime.fromtimestamp(0)
+ time = None # RFC 4287 requires atom:updated
summary = ''
for child in xml:
if child.tag.endswith('Atom}title'):
- title = child.text
+ title = unparse_atom_text(child)
elif child.tag.endswith('Atom}link'):
rel = child.attrib.get('rel')
if rel == 'alternate' or not rel: link = child.attrib['href']
- elif child.tag.endswith('Atom}published'):
+ elif (child.tag.endswith('Atom}published')
+ or child.tag.endswith('Atom}updated') and time is None):
iso = child.text.replace('Z', '+00:00') # normalized
time = datetime.fromisoformat(iso)
- elif child.tag.endswith('Atom}summary'):
- summary = child.text
- elif child.tag.endswith('Atom}content') and not summary:
- if child.attrib.get('type') == 'xhtml':
- assert len(child) == 1 and child[0].tag.endswith('xhtml}div')
- summary = unparse_xml(child[0]).decode()
- else:
- summary = child.text
+ elif (child.tag.endswith('Atom}summary')
+ or child.tag.endswith('Atom}content') and not summary):
+ summary = unparse_atom_text(child)
return title, link, time, summary
@@ -138,7 +145,7 @@ def parse_atom(xml, title, link):
entries = []
for child in xml:
if child.tag.endswith('Atom}title'):
- title = child.text
+ title = unparse_atom_text(child)
elif child.tag.endswith('Atom}link'):
rel = child.attrib.get('rel')
if rel == 'alternate' or not rel: link = child.attrib['href']