From 91fc1146505d33629cc80199aa1a5881565e4e50 Mon Sep 17 00:00:00 2001 From: Nguyễn Gia Phong Date: Tue, 8 Nov 2022 18:58:14 +0900 Subject: Parse all necessary information --- src/fead.py | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/src/fead.py b/src/fead.py index d9b94a3..eede317 100755 --- a/src/fead.py +++ b/src/fead.py @@ -21,14 +21,17 @@ from datetime import datetime from email.utils import parsedate_to_datetime from http.client import HTTPResponse from io import BytesIO +from operator import attrgetter from sys import stdin from urllib.error import HTTPError from urllib.parse import urljoin, urlsplit +from warnings import warn from xml.etree.ElementTree import fromstring as parse_xml REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n' -Feed = namedtuple('Feed', ('title', 'items')) -Item = namedtuple('Item', ('title', 'link', 'date', 'description')) +Item = namedtuple('Item', ('title', 'link', 'time', 'summary')) +Advert = namedtuple('Advert', ('source_title', 'source_link', + 'title', 'link', 'time', 'summary')) class BytesSocket: @@ -44,7 +47,7 @@ class BytesSocket: def parse_rss_item(xml): """Parse given RSS item.""" - date = None + time = datetime.fromtimestamp(0).astimezone(None) description = '' for child in xml: if child.tag == 'title': @@ -52,56 +55,61 @@ def parse_rss_item(xml): elif child.tag == 'link': link = child.text elif child.tag == 'pubDate': - date = parsedate_to_datetime(child.text) + time = parsedate_to_datetime(child.text).astimezone(None) elif child.tag == 'description': description = child.text elif child.tag == 'content:encoded' and not description: description = child.text if not description: description = xml.text - return Item(title, link, date, description) + return Item(title, link, time, description) -def parse_rss(xml, url): +def parse_rss(xml, title): """Parse given RSS feed.""" - title = url items = [] for child in xml: if child.tag == 'title': title = child.text + elif child.tag == 'link': + link = child.text elif child.tag == 'item': items.append(parse_rss_item(child)) - return Feed(title, items) + return Advert(title, link, *max(items, key=attrgetter('time'))) def parse_atom_entry(xml): """Parse given Atom entry.""" - date = None + time = datetime.fromtimestamp(0).astimezone(None) summary = '' for child in xml: if child.tag.endswith('Atom}title'): title = child.text elif child.tag.endswith('Atom}link'): - link = child.attrib['href'] + rel = child.attrib.get('rel') + if rel == 'alternate' or not rel: link = child.attrib['href'] elif child.tag.endswith('Atom}published'): - date = datetime.fromisoformat(child.text.replace('Z', '+00:00')) + iso = child.text.replace('Z', '+00:00') # normalized + time = datetime.fromisoformat(iso).astimezone(None) elif child.tag.endswith('Atom}summary'): summary = child.text elif child.tag.endswith('Atom}content') and not summary: summary = child.text - return Item(title, link, date, summary) + return Item(title, link, time, summary) -def parse_atom(xml, url): +def parse_atom(xml, title): """Parse given Atom feed.""" - title = url entries = [] for child in xml: if child.tag.endswith('Atom}title'): title = child.text + elif child.tag.endswith('Atom}link'): + rel = child.attrib.get('rel') + if rel == 'alternate' or not rel: link = child.attrib['href'] elif child.tag.endswith('Atom}entry'): entries.append(parse_atom_entry(child)) - return Feed(title, entries) + return Advert(title, link, *max(entries, key=attrgetter('time'))) async def fetch(url): @@ -119,11 +127,12 @@ async def fetch(url): response.begin() with response: if response.status >= 400: - raise HTTPError(url.geturl(), response.status, response.reason, + raise HTTPError(url.geturl(), response.status, + f'{response.reason}: {url.geturl()}', response.getheaders(), response) if response.status >= 300: location = urljoin(url.geturl(), response.getheader('Location')) - print(url.geturl(), '->', location) + warn(f'{url.geturl()} redirects to {location}') return await fetch(urlsplit(location)) if response.status >= 200: xml = parse_xml(response.read()) @@ -133,7 +142,9 @@ async def fetch(url): if xml.tag.endswith('Atom}feed'): return parse_atom(xml, url.hostname) raise ValueError(f'unsupported feed format at {url.geturl()}') - # FIXME: handle informational responses + raise HTTPError(url.geturl(), response.status, + f'{response.reason}: {url.geturl()}', + response.getheaders(), response) async def fetch_all(urls): @@ -147,6 +158,5 @@ async def fetch_all(urls): if __name__ == '__main__': - feeds = run(fetch_all(stdin.readlines())) - for feed in feeds: - print(feed) + feeds = sorted(run(fetch_all(stdin.readlines())), + key=attrgetter('time'), reverse=True) -- cgit 1.4.1