zs

Zeitungsschau RSS to email converter
git clone git://r-36.net/zs
Log | Files | Refs | LICENSE

commit 622855cba32e158a1b0cacea44e21361685d2577
parent 10c3bbd37c63294007b0f3c28d47665fc625785b
Author: Christoph Lohmann <20h@r-36.net>
Date:   Thu, 10 Sep 2020 14:11:51 +0200

Merge branch 'master' of ssh://r-36.net:443/srv/git/zs

Diffstat:
zeitungsschau/feed.py | 12 +++++++-----
zeitungsschau/feedemail.py | 10 +++++++---
zs | 5 +++++
3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py @@ -5,8 +5,9 @@ # by 20h # -from lxml import objectify -from lxml import etree +import lxml +import lxml.objectify +import html from datetime import datetime import dateutil.parser from dateutil.tz import gettz @@ -14,7 +15,6 @@ import requests import hashlib import pytz import codecs -import html import urllib.parse import socket import json @@ -44,9 +44,10 @@ def removenamespaces(xml): elem.tag = elem.tag[nsl:] def parsexml(astr): - xml = objectify.fromstring(astr) + xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8")) removenamespaces(xml) # Throw XML parsing errors so we can blame the feed authors. + #print(lxml.objectify.dump(xml)) return xml def parsetwtxtfeed(astr, uri): @@ -278,6 +279,8 @@ def parseatomfeed(astr): if hasattr(entry, "updated"): article["updated"] = parseiso(entry.updated,\ now) + elif hasattr(entry, "temporary"): + article["updated"] = now elif hasattr(entry, "pubDate"): article["updated"] = parseiso(entry.pubDate,\ now) @@ -397,7 +400,6 @@ def fetch(uri): s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((host, port)) s.send(("%s\r\n" % (selector)).encode("utf-8")) - s.shutdown(1) fd = s.makefile("r") fval = fd.read().encode("utf-8") s.close() diff --git a/zeitungsschau/feedemail.py b/zeitungsschau/feedemail.py @@ -13,6 +13,7 @@ from email.header import Header import time import subprocess import lxml.html +import lxml.etree import urllib.parse import html2text @@ -20,10 +21,13 @@ import html2text def normalizeheader(hstr): if len(hstr) == 0: return "" + try: + return lxml.html.fromstring(hstr).text_content().\ + replace(u"\xa0", "").\ + replace("\n", " ").strip() + except lxml.etree.ParserError: + return "" - return lxml.html.fromstring(hstr).text_content().\ - replace(u"\xa0", "").\ - replace("\n", " ").strip() class LocalSendmail(object): cmd="/usr/sbin/sendmail -f \"%s\" \"%s\"" diff --git a/zs b/zs @@ -52,6 +52,11 @@ def run(db, 
selfeed=None, dryrun=False, onlychanges=False): print("fetch %s" % (feeduri)) curfeed = None rcode = 0 + + """ + # All errors. + (rcode, curfeed) = feed.fetch(feeduri) + """ try: (rcode, curfeed) = feed.fetch(feeduri) except socket.gaierror: