author | Nguyễn Gia Phong <mcsinyx@disroot.org> | 2022-11-07 19:09:26 +0900
---|---|---
committer | Nguyễn Gia Phong <mcsinyx@disroot.org> | 2022-11-07 19:09:26 +0900
commit | 7e2a8ccd4a65de0ed557b94805edbed6922e3853 (patch) |
tree | 1341b1ac6a708144b09155a5da040d96960e9ad9 /src |
download | fead-7e2a8ccd4a65de0ed557b94805edbed6922e3853.tar.gz |
Draft fetching and initial parsing
Diffstat (limited to 'src')
-rwxr-xr-x | src/fead.py | 152 |
1 file changed, 152 insertions, 0 deletions
diff --git a/src/fead.py b/src/fead.py
new file mode 100755
index 0000000..d9b94a3
--- /dev/null
+++ b/src/fead.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# Format mbox as HTML/XML
+# Copyright (C) 2021-2022 Nguyễn Gia Phong
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from asyncio import gather, open_connection, run
+from collections import namedtuple
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+from http.client import HTTPResponse
+from io import BytesIO
+from sys import stdin
+from urllib.error import HTTPError
+from urllib.parse import urljoin, urlsplit
+from xml.etree.ElementTree import fromstring as parse_xml
+
+REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'
+Feed = namedtuple('Feed', ('title', 'items'))
+Item = namedtuple('Item', ('title', 'link', 'date', 'description'))
+
+
+class BytesSocket:
+    """Duck socket for HTTPResponse."""
+    def __init__(self, response):
+        self.bytes = response
+
+    def makefile(self, mode, *args, **kwargs):
+        """Return a bytes stream."""
+        assert mode == 'rb'
+        return BytesIO(self.bytes)
+
+
+def parse_rss_item(xml):
+    """Parse given RSS item."""
+    date = None
+    description = ''
+    for child in xml:
+        if child.tag == 'title':
+            title = child.text
+        elif child.tag == 'link':
+            link = child.text
+        elif child.tag == 'pubDate':
+            date = parsedate_to_datetime(child.text)
+        elif child.tag == 'description':
+            description = child.text
+        elif child.tag == 'content:encoded' and not description:
+            description = child.text
+    if not description:
+        description = xml.text
+    return Item(title, link, date, description)
+
+
+def parse_rss(xml, url):
+    """Parse given RSS feed."""
+    title = url
+    items = []
+    for child in xml:
+        if child.tag == 'title':
+            title = child.text
+        elif child.tag == 'item':
+            items.append(parse_rss_item(child))
+    return Feed(title, items)
+
+
+def parse_atom_entry(xml):
+    """Parse given Atom entry."""
+    date = None
+    summary = ''
+    for child in xml:
+        if child.tag.endswith('Atom}title'):
+            title = child.text
+        elif child.tag.endswith('Atom}link'):
+            link = child.attrib['href']
+        elif child.tag.endswith('Atom}published'):
+            date = datetime.fromisoformat(child.text.replace('Z', '+00:00'))
+        elif child.tag.endswith('Atom}summary'):
+            summary = child.text
+        elif child.tag.endswith('Atom}content') and not summary:
+            summary = child.text
+    return Item(title, link, date, summary)
+
+
+def parse_atom(xml, url):
+    """Parse given Atom feed."""
+    title = url
+    entries = []
+    for child in xml:
+        if child.tag.endswith('Atom}title'):
+            title = child.text
+        elif child.tag.endswith('Atom}entry'):
+            entries.append(parse_atom_entry(child))
+    return Feed(title, entries)
+
+
+async def fetch(url):
+    """Fetch web feed from given URL and return it parsed."""
+    if url.scheme == 'https':
+        reader, writer = await open_connection(url.hostname, 443, ssl=True)
+    elif url.scheme == 'http':
+        reader, writer = await open_connection(url.hostname, 80)
+    else:
+        raise ValueError(f'unsupported URL scheme: {url.scheme}')
+    writer.write(REQUEST.format(url.path or '/', url.hostname).encode())
+    response = HTTPResponse(BytesSocket(await reader.read()))
+    writer.close()
+
+    response.begin()
+    with response:
+        if response.status >= 400:
+            raise HTTPError(url.geturl(), response.status, response.reason,
+                            response.getheaders(), response)
+        if response.status >= 300:
+            location = urljoin(url.geturl(), response.getheader('Location'))
+            print(url.geturl(), '->', location)
+            return await fetch(urlsplit(location))
+        if response.status >= 200:
+            xml = parse_xml(response.read())
+            if xml.tag == 'rss':
+                assert xml[0].tag == 'channel'
+                return parse_rss(xml[0], url.hostname)
+            if xml.tag.endswith('Atom}feed'):
+                return parse_atom(xml, url.hostname)
+            raise ValueError(f'unsupported feed format at {url.geturl()}')
+        # FIXME: handle informational responses
+
+
+async def fetch_all(urls):
+    """Fetch all given URLs asynchronously and return them parsed."""
+    tasks = gather(*(fetch(urlsplit(url)) for url in urls))
+    try:
+        return await tasks
+    except:
+        tasks.cancel()
+        raise
+
+
+if __name__ == '__main__':
+    feeds = run(fetch_all(stdin.readlines()))
+    for feed in feeds:
+        print(feed)
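
The notable trick in this draft is parsing a raw HTTP response without a live socket: `http.client.HTTPResponse` only ever calls `makefile('rb')` on the object handed to it, so once the asyncio stream has been drained into bytes, `BytesSocket` can stand in for the socket. A minimal sketch of that trick in isolation (the response bytes below are made up for illustration):

```python
from http.client import HTTPResponse
from io import BytesIO


class BytesSocket:
    """Duck socket: provides only the makefile() method HTTPResponse needs."""
    def __init__(self, data):
        self.data = data

    def makefile(self, mode, *args, **kwargs):
        # HTTPResponse asks for a binary read stream over the response bytes.
        assert mode == 'rb'
        return BytesIO(self.data)


# A made-up HTTP/1.0 response, standing in for `await reader.read()`.
raw = b'HTTP/1.0 200 OK\r\nContent-Type: application/xml\r\n\r\n<rss version="2.0"/>'
response = HTTPResponse(BytesSocket(raw))
response.begin()          # parse the status line and headers
print(response.status)    # 200
print(response.read())    # b'<rss version="2.0"/>'
```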
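As `fetch_all(stdin.readlines())` suggests, the script expects newline-separated feed URLs on standard input and prints each parsed feed as a raw `Feed(...)` namedtuple, so a trial run might look like `echo https://example.com/feed.xml | python3 src/fead.py` (the URL here is only a placeholder). Note that `readlines()` keeps the trailing newline on each URL; recent Python versions strip ASCII newlines in `urlsplit`, but on older versions the newline would leak into the request path.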