author    Nguyễn Gia Phong <mcsinyx@disroot.org>  2022-11-07 19:09:26 +0900
committer Nguyễn Gia Phong <mcsinyx@disroot.org>  2022-11-07 19:09:26 +0900
commit    7e2a8ccd4a65de0ed557b94805edbed6922e3853 (patch)
tree      1341b1ac6a708144b09155a5da040d96960e9ad9 /src
download  fead-7e2a8ccd4a65de0ed557b94805edbed6922e3853.tar.gz
Draft fetching and initial parsing
Diffstat (limited to 'src')
-rwxr-xr-x  src/fead.py  152
1 file changed, 152 insertions(+), 0 deletions(-)
diff --git a/src/fead.py b/src/fead.py
new file mode 100755
index 0000000..d9b94a3
--- /dev/null
+++ b/src/fead.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# Fetch and parse web feeds
+# Copyright (C) 2021-2022  Nguyễn Gia Phong
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from asyncio import gather, open_connection, run
+from collections import namedtuple
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+from http.client import HTTPResponse
+from io import BytesIO
+from sys import stdin
+from urllib.error import HTTPError
+from urllib.parse import urljoin, urlsplit
+from xml.etree.ElementTree import fromstring as parse_xml
+
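+# HTTP/1.0 request template: the server closes the connection after
+# responding, so the whole response can be read until EOF.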
+REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'
+Feed = namedtuple('Feed', ('title', 'items'))
+Item = namedtuple('Item', ('title', 'link', 'date', 'description'))
+
+
+class BytesSocket:
+    """Duck socket for HTTPResponse."""
+    def __init__(self, response):
+        self.bytes = response
+
+    def makefile(self, mode, *args, **kwargs):
+        """Return a bytes stream."""
+        assert mode == 'rb'
+        return BytesIO(self.bytes)
+
+
+def parse_rss_item(xml):
+    """Parse given RSS item."""
+    title = link = None  # an item may lack any of these children
+    date = None
+    description = ''
+    for child in xml:
+        if child.tag == 'title':
+            title = child.text
+        elif child.tag == 'link':
+            link = child.text
+        elif child.tag == 'pubDate':
+            date = parsedate_to_datetime(child.text)
+        elif child.tag == 'description':
+            description = child.text
+        # ElementTree expands namespace prefixes to URIs, so content:encoded
+        # shows up under the RSS content module's namespace.
+        elif (child.tag == '{http://purl.org/rss/1.0/modules/content/}encoded'
+              and not description):
+            description = child.text
+    if not description:
+        description = xml.text
+    return Item(title, link, date, description)
+
+
+def parse_rss(xml, url):
+    """Parse given RSS feed."""
+    title = url
+    items = []
+    for child in xml:
+        if child.tag == 'title':
+            title = child.text
+        elif child.tag == 'item':
+            items.append(parse_rss_item(child))
+    return Feed(title, items)
+
+
+def parse_atom_entry(xml):
+    """Parse given Atom entry."""
+    title = link = None  # an entry may lack any of these children
+    date = None
+    summary = ''
+    for child in xml:
+        if child.tag.endswith('Atom}title'):
+            title = child.text
+        elif child.tag.endswith('Atom}link'):
+            link = child.attrib['href']
+        elif child.tag.endswith('Atom}published'):
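+            # fromisoformat() rejects a trailing Z before Python 3.11.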
+            date = datetime.fromisoformat(child.text.replace('Z', '+00:00'))
+        elif child.tag.endswith('Atom}summary'):
+            summary = child.text
+        elif child.tag.endswith('Atom}content') and not summary:
+            summary = child.text
+    return Item(title, link, date, summary)
+
+
+def parse_atom(xml, url):
+    """Parse given Atom feed."""
+    title = url
+    entries = []
+    for child in xml:
+        if child.tag.endswith('Atom}title'):
+            title = child.text
+        elif child.tag.endswith('Atom}entry'):
+            entries.append(parse_atom_entry(child))
+    return Feed(title, entries)
+
+
+async def fetch(url):
+    """Fetch web feed from given URL and return it parsed."""
+    if url.scheme == 'https':
+        reader, writer = await open_connection(url.hostname, 443, ssl=True)
+    elif url.scheme == 'http':
+        reader, writer = await open_connection(url.hostname, 80)
+    else:
+        raise ValueError(f'unsupported URL scheme: {url.scheme}')
+    path = url.path or '/'
+    if url.query:  # keep the query string in the request target
+        path = f'{path}?{url.query}'
+    writer.write(REQUEST.format(path, url.hostname).encode())
+    response = HTTPResponse(BytesSocket(await reader.read()))
+    writer.close()
+
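+    # Have HTTPResponse parse the status line and headers from the buffer.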
+    response.begin()
+    with response:
+        if response.status >= 400:
+            raise HTTPError(url.geturl(), response.status, response.reason,
+                            response.getheaders(), response)
+        if response.status >= 300:
+            location = urljoin(url.geturl(), response.getheader('Location'))
+            print(url.geturl(), '->', location)
+            return await fetch(urlsplit(location))
+        if response.status >= 200:
+            xml = parse_xml(response.read())
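+            # Dispatch on the document's root element: RSS feeds use a plain
+            # 'rss' tag, Atom feeds a namespaced 'feed'.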
+            if xml.tag == 'rss':
+                assert xml[0].tag == 'channel'
+                return parse_rss(xml[0], url.hostname)
+            if xml.tag.endswith('Atom}feed'):
+                return parse_atom(xml, url.hostname)
+            raise ValueError(f'unsupported feed format at {url.geturl()}')
+        # FIXME: handle informational responses
+
+
+async def fetch_all(urls):
+    """Fetch all given URLs asynchronously and return them parsed."""
+    tasks = gather(*(fetch(urlsplit(url)) for url in urls))
+    try:
+        return await tasks
+    except BaseException:
+        # gather() does not cancel the remaining awaitables on failure,
+        # so cancel them before propagating the exception.
+        tasks.cancel()
+        raise
+
+
+if __name__ == '__main__':
+    feeds = run(fetch_all(line.strip() for line in stdin))
+    for feed in feeds:
+        print(feed)
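
A minimal sketch of trying the draft from Python rather than standard input
(the feed URL is hypothetical, and fead is assumed importable from src/):

    from asyncio import run

    from fead import fetch_all

    # Hypothetical feed URL; any RSS or Atom feed should do.
    feeds = run(fetch_all(['https://example.com/atom.xml']))
    for feed in feeds:
        print(feed.title)
        for item in feed.items:
            print('*', item.title, item.link)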