author     Nguyễn Gia Phong <mcsinyx@disroot.org>  2022-11-07 19:09:26 +0900
committer  Nguyễn Gia Phong <mcsinyx@disroot.org>  2022-11-07 19:09:26 +0900
commit     7e2a8ccd4a65de0ed557b94805edbed6922e3853 (patch)
tree       1341b1ac6a708144b09155a5da040d96960e9ad9 /src
Draft fetching and initial parsing
Diffstat (limited to 'src')
-rwxr-xr-x  src/fead.py  179
1 file changed, 179 insertions(+), 0 deletions(-)
diff --git a/src/fead.py b/src/fead.py
new file mode 100755
index 0000000..d9b94a3
--- /dev/null
+++ b/src/fead.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# Fetch and parse web feeds
+# Copyright (C) 2021-2022 Nguyễn Gia Phong
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from asyncio import gather, open_connection, run
+from collections import namedtuple
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+from http.client import HTTPResponse
+from io import BytesIO
+from sys import stdin
+from urllib.error import HTTPError
+from urllib.parse import urljoin, urlsplit
+from xml.etree.ElementTree import fromstring as parse_xml
+
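+# Minimal HTTP/1.0 GET request template; HTTP/1.0 implies the server
+# closes the connection, so the whole response can be read at once.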
+REQUEST = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'
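+# Format-agnostic representations of a parsed feed and a feed item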
+Feed = namedtuple('Feed', ('title', 'items'))
+Item = namedtuple('Item', ('title', 'link', 'date', 'description'))
+
+
+class BytesSocket:
+ """Duck socket for HTTPResponse."""
+ def __init__(self, response):
+ self.bytes = response
+
+ def makefile(self, mode, *args, **kwargs):
+ """Return a bytes stream."""
+ assert mode == 'rb'
+ return BytesIO(self.bytes)
+
+
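+# RSS 2.0 elements live in the default (empty) namespace,
+# so bare tag names like 'title' match directly.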
+def parse_rss_item(xml):
+ """Parse given RSS item."""
+    title = None
+    link = None
+    date = None
+    description = ''
+ for child in xml:
+ if child.tag == 'title':
+ title = child.text
+ elif child.tag == 'link':
+ link = child.text
+ elif child.tag == 'pubDate':
+ date = parsedate_to_datetime(child.text)
+ elif child.tag == 'description':
+ description = child.text
+        # content:encoded is namespace-expanded by ElementTree to
+        # {http://purl.org/rss/1.0/modules/content/}encoded
+        elif child.tag.endswith('}encoded') and not description:
+ description = child.text
+ if not description:
+ description = xml.text
+ return Item(title, link, date, description)
+
+
+def parse_rss(xml, url):
+ """Parse given RSS feed."""
+ title = url
+ items = []
+ for child in xml:
+ if child.tag == 'title':
+ title = child.text
+ elif child.tag == 'item':
+ items.append(parse_rss_item(child))
+ return Feed(title, items)
+
+
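+# ElementTree qualifies Atom tags with their namespace, e.g.
+# {http://www.w3.org/2005/Atom}title, hence the endswith() checks.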
+def parse_atom_entry(xml):
+ """Parse given Atom entry."""
+    title = None
+    link = None
+    date = None
+    summary = ''
+ for child in xml:
+ if child.tag.endswith('Atom}title'):
+ title = child.text
+ elif child.tag.endswith('Atom}link'):
+ link = child.attrib['href']
+ elif child.tag.endswith('Atom}published'):
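+            # datetime.fromisoformat() accepts a trailing 'Z'
+            # only from Python 3.11, hence the manual replacement.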
+ date = datetime.fromisoformat(child.text.replace('Z', '+00:00'))
+ elif child.tag.endswith('Atom}summary'):
+ summary = child.text
+ elif child.tag.endswith('Atom}content') and not summary:
+ summary = child.text
+ return Item(title, link, date, summary)
+
+
+def parse_atom(xml, url):
+ """Parse given Atom feed."""
+ title = url
+ entries = []
+ for child in xml:
+ if child.tag.endswith('Atom}title'):
+ title = child.text
+ elif child.tag.endswith('Atom}entry'):
+ entries.append(parse_atom_entry(child))
+ return Feed(title, entries)
+
+
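+# Fetching is done over raw asyncio connections rather than urllib
+# so that multiple feeds can be downloaded concurrently.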
+async def fetch(url):
+ """Fetch web feed from given URL and return it parsed."""
+ if url.scheme == 'https':
+ reader, writer = await open_connection(url.hostname, 443, ssl=True)
+ elif url.scheme == 'http':
+ reader, writer = await open_connection(url.hostname, 80)
+ else:
+ raise ValueError(f'unsupported URL scheme: {url.scheme}')
+    path = url.path or '/'
+    if url.query:
+        path = f'{path}?{url.query}'
+    writer.write(REQUEST.format(path, url.hostname).encode())
+ response = HTTPResponse(BytesSocket(await reader.read()))
+ writer.close()
+
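+    # begin() parses the status line and headers from the buffered bytes.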
+ response.begin()
+ with response:
+ if response.status >= 400:
+ raise HTTPError(url.geturl(), response.status, response.reason,
+ response.getheaders(), response)
+ if response.status >= 300:
+ location = urljoin(url.geturl(), response.getheader('Location'))
+ print(url.geturl(), '->', location)
+ return await fetch(urlsplit(location))
+ if response.status >= 200:
+ xml = parse_xml(response.read())
+ if xml.tag == 'rss':
+ assert xml[0].tag == 'channel'
+ return parse_rss(xml[0], url.hostname)
+ if xml.tag.endswith('Atom}feed'):
+ return parse_atom(xml, url.hostname)
+ raise ValueError(f'unsupported feed format at {url.geturl()}')
+ # FIXME: handle informational responses
+
+
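+# gather() runs all fetches concurrently; if any of them fails,
+# the remaining ones are cancelled explicitly below.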
+async def fetch_all(urls):
+ """Fetch all given URLs asynchronously and return them parsed."""
+ tasks = gather(*(fetch(urlsplit(url)) for url in urls))
+ try:
+ return await tasks
+    except BaseException:
+ tasks.cancel()
+ raise
+
+
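+# Usage sketch (hypothetical URL): feed URLs are read from stdin,
+# one per line, e.g. echo https://example.com/feed.xml | ./src/fead.py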
+if __name__ == '__main__':
+    feeds = run(fetch_all(map(str.strip, stdin)))
+ for feed in feeds:
+ print(feed)