From 355050b447929113577301a97015a6142a86f4a8 Mon Sep 17 00:00:00 2001 From: Nguyễn Gia Phong Date: Fri, 18 Aug 2023 12:39:30 +0900 Subject: Comply better with RFC 4287 --- src/fead.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/fead.py b/src/fead.py index e9abd90..a900fce 100755 --- a/src/fead.py +++ b/src/fead.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Advert generator from web feeds -# Copyright (C) 2022, 2024 Nguyễn Gia Phong +# Copyright (C) 2022-2024 Nguyễn Gia Phong # Copyright (C) 2023 Ngô Ngọc Đức Huy # # This program is free software: you can redistribute it and/or modify @@ -40,6 +40,7 @@ from xml.etree.ElementTree import (fromstring as parse_xml, REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n' HTML_TAG = regex('<.+?>') +XHTML_NS = '{http://www.w3.org/1999/xhtml}' Advert = namedtuple('Advert', ('source_title', 'source_link', 'title', 'link', 'time', 'summary')) @@ -109,27 +110,33 @@ def parse_rss(xml, title): return title, link, items +def unparse_atom_text(xml): + """Extract Atom Text construct.""" + if xml.attrib.get('type') == 'xhtml': + assert len(xml) == 1 and xml[0].tag.endswith(f'{XHTML_NS}div') + for elem in xml[0].iter(): + elem.tag = elem.tag[len(XHTML_NS):] + return unparse_xml(xml[0]).decode() + return xml.text + + def parse_atom_entry(xml): """Parse given Atom entry.""" - time = datetime.fromtimestamp(0) + time = None # RFC 4287 requires atom:updated summary = '' for child in xml: if child.tag.endswith('Atom}title'): - title = child.text + title = unparse_atom_text(child) elif child.tag.endswith('Atom}link'): rel = child.attrib.get('rel') if rel == 'alternate' or not rel: link = child.attrib['href'] - elif child.tag.endswith('Atom}published'): + elif (child.tag.endswith('Atom}published') + or child.tag.endswith('Atom}updated') and time is None): iso = child.text.replace('Z', '+00:00') # normalized time = datetime.fromisoformat(iso) - elif child.tag.endswith('Atom}summary'): - summary = child.text - elif child.tag.endswith('Atom}content') and not summary: - if child.attrib.get('type') == 'xhtml': - assert len(child) == 1 and child[0].tag.endswith('xhtml}div') - summary = unparse_xml(child[0]).decode() - else: - summary = child.text + elif (child.tag.endswith('Atom}summary') + or child.tag.endswith('Atom}content') and not summary): + summary = unparse_atom_text(child) return title, link, time, summary @@ -138,7 +145,7 @@ def parse_atom(xml, title, link): entries = [] for child in xml: if child.tag.endswith('Atom}title'): - title = child.text + title = unparse_atom_text(child) elif child.tag.endswith('Atom}link'): rel = child.attrib.get('rel') if rel == 'alternate' or not rel: link = child.attrib['href'] -- cgit 1.4.1