author     Nguyễn Gia Phong <mcsinyx@disroot.org>  2022-11-08 18:58:14 +0900
committer  Nguyễn Gia Phong <mcsinyx@disroot.org>  2022-11-08 18:58:14 +0900
commit     91fc1146505d33629cc80199aa1a5881565e4e50 (patch)
tree       472fb0d66c9c9ae84730764783c68ffcaba7757e /src
parent     7e2a8ccd4a65de0ed557b94805edbed6922e3853 (diff)
download   fead-91fc1146505d33629cc80199aa1a5881565e4e50.tar.gz
Parse all necessary information
Diffstat (limited to 'src')
-rwxr-xr-x  src/fead.py  52
1 file changed, 31 insertions(+), 21 deletions(-)
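
The patch replaces the old Feed namedtuple with Advert, flattening each feed
down to its single most recent entry. A minimal sketch of the selection rule
before the full diff; the sample items and names such as epoch are
illustrative, not from the commit:

    from collections import namedtuple
    from datetime import datetime
    from operator import attrgetter

    Item = namedtuple('Item', ('title', 'link', 'time', 'summary'))
    Advert = namedtuple('Advert', ('source_title', 'source_link',
                                   'title', 'link', 'time', 'summary'))

    # Undated items fall back to the Unix epoch as an aware local datetime,
    # so max() never has to compare a datetime against None.
    epoch = datetime.fromtimestamp(0).astimezone(None)
    items = [Item('old post', 'https://example.net/old', epoch, ''),
             Item('new post', 'https://example.net/new',
                  datetime.now().astimezone(None), '')]
    advert = Advert('feed title', 'https://example.net',
                    *max(items, key=attrgetter('time')))
    assert advert.title == 'new post'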
diff --git a/src/fead.py b/src/fead.py
index d9b94a3..eede317 100755
--- a/src/fead.py
+++ b/src/fead.py
@@ -21,14 +21,17 @@ from datetime import datetime
from email.utils import parsedate_to_datetime
from http.client import HTTPResponse
from io import BytesIO
+from operator import attrgetter
from sys import stdin
from urllib.error import HTTPError
from urllib.parse import urljoin, urlsplit
+from warnings import warn
from xml.etree.ElementTree import fromstring as parse_xml
REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'
-Feed = namedtuple('Feed', ('title', 'items'))
-Item = namedtuple('Item', ('title', 'link', 'date', 'description'))
+Item = namedtuple('Item', ('title', 'link', 'time', 'summary'))
+Advert = namedtuple('Advert', ('source_title', 'source_link',
+ 'title', 'link', 'time', 'summary'))
class BytesSocket:
@@ -44,7 +47,7 @@ class BytesSocket:
def parse_rss_item(xml):
"""Parse given RSS item."""
- date = None
+ time = datetime.fromtimestamp(0).astimezone(None)
description = ''
for child in xml:
if child.tag == 'title':
@@ -52,56 +55,61 @@ def parse_rss_item(xml):
elif child.tag == 'link':
link = child.text
elif child.tag == 'pubDate':
- date = parsedate_to_datetime(child.text)
+ time = parsedate_to_datetime(child.text).astimezone(None)
elif child.tag == 'description':
description = child.text
elif child.tag == 'content:encoded' and not description:
description = child.text
if not description:
description = xml.text
- return Item(title, link, date, description)
+ return Item(title, link, time, description)
-def parse_rss(xml, url):
+def parse_rss(xml, title):
"""Parse given RSS feed."""
- title = url
items = []
for child in xml:
if child.tag == 'title':
title = child.text
+ elif child.tag == 'link':
+ link = child.text
elif child.tag == 'item':
items.append(parse_rss_item(child))
- return Feed(title, items)
+ return Advert(title, link, *max(items, key=attrgetter('time')))
def parse_atom_entry(xml):
"""Parse given Atom entry."""
- date = None
+ time = datetime.fromtimestamp(0).astimezone(None)
summary = ''
for child in xml:
if child.tag.endswith('Atom}title'):
title = child.text
elif child.tag.endswith('Atom}link'):
- link = child.attrib['href']
+ rel = child.attrib.get('rel')
+ if rel == 'alternate' or not rel: link = child.attrib['href']
elif child.tag.endswith('Atom}published'):
- date = datetime.fromisoformat(child.text.replace('Z', '+00:00'))
+ iso = child.text.replace('Z', '+00:00') # normalized
+ time = datetime.fromisoformat(iso).astimezone(None)
elif child.tag.endswith('Atom}summary'):
summary = child.text
elif child.tag.endswith('Atom}content') and not summary:
summary = child.text
- return Item(title, link, date, summary)
+ return Item(title, link, time, summary)
-def parse_atom(xml, url):
+def parse_atom(xml, title):
"""Parse given Atom feed."""
- title = url
entries = []
for child in xml:
if child.tag.endswith('Atom}title'):
title = child.text
+ elif child.tag.endswith('Atom}link'):
+ rel = child.attrib.get('rel')
+ if rel == 'alternate' or not rel: link = child.attrib['href']
elif child.tag.endswith('Atom}entry'):
entries.append(parse_atom_entry(child))
- return Feed(title, entries)
+ return Advert(title, link, *max(entries, key=attrgetter('time')))
async def fetch(url):
@@ -119,11 +127,12 @@ async def fetch(url):
response.begin()
with response:
if response.status >= 400:
- raise HTTPError(url.geturl(), response.status, response.reason,
+ raise HTTPError(url.geturl(), response.status,
+ f'{response.reason}: {url.geturl()}',
response.getheaders(), response)
if response.status >= 300:
location = urljoin(url.geturl(), response.getheader('Location'))
- print(url.geturl(), '->', location)
+ warn(f'{url.geturl()} redirects to {location}')
return await fetch(urlsplit(location))
if response.status >= 200:
xml = parse_xml(response.read())
@@ -133,7 +142,9 @@ async def fetch(url):
if xml.tag.endswith('Atom}feed'):
return parse_atom(xml, url.hostname)
raise ValueError(f'unsupported feed format at {url.geturl()}')
- # FIXME: handle informational responses
+ raise HTTPError(url.geturl(), response.status,
+ f'{response.reason}: {url.geturl()}',
+ response.getheaders(), response)
async def fetch_all(urls):
@@ -147,6 +158,5 @@ async def fetch_all(urls):
if __name__ == '__main__':
- feeds = run(fetch_all(stdin.readlines()))
- for feed in feeds:
- print(feed)
+ feeds = sorted(run(fetch_all(stdin.readlines())),
+ key=attrgetter('time'), reverse=True)
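
For reference, a quick way to exercise the new parser end to end; the inline
sample feed and importing src/fead.py as fead are assumptions for
illustration, not part of the commit:

    from xml.etree.ElementTree import fromstring as parse_xml

    from fead import parse_rss  # assumes src/ is on sys.path

    rss = parse_xml(
        '<rss><channel>'
        '<title>Example</title><link>https://example.net</link>'
        '<item><title>Old</title><link>https://example.net/1</link>'
        '<pubDate>Mon, 07 Nov 2022 00:00:00 +0000</pubDate></item>'
        '<item><title>New</title><link>https://example.net/2</link>'
        '<pubDate>Tue, 08 Nov 2022 00:00:00 +0000</pubDate></item>'
        '</channel></rss>')
    advert = parse_rss(rss[0], 'fallback title')  # rss[0] is the <channel>
    assert advert.source_title == 'Example'
    assert advert.title == 'New'  # the most recent item wins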