zs

Zeitungsschau RSS to email converter
git clone git://r-36.net/zs
Log | Files | Refs | LICENSE

commit 9891ca73640aa4fa074c54e92913f847ba1e756b
parent 9e95a0f332a1bfabfba59c9bad6460e70731db9f
Author: Christoph Lohmann <20h@r-36.net>
Date:   Wed, 11 Nov 2015 22:08:35 +0100

Simplify parsing and unescape text entries.

Diffstat:
zeitungsschau/feed.py | 62+++++++++++++++++++++++++++++---------------------------------
1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py @@ -13,6 +13,7 @@ import requests import hashlib import pytz import codecs +import html def parseiso(dstr, now): try: @@ -32,18 +33,9 @@ def removenamespaces(xml): elem.tag = elem.tag[nsl:] def parsexml(astr): - try: - xml = objectify.fromstring(astr) - removenamespaces(xml) - except etree.XMLSyntaxError: - try: - parser = etree.HTMLParser() - xml = objectify.fromstring(astr, parser) - removenamespaces(xml) - except etree.XMLSyntaxError: - parser = etree.XMLParser(resolve_entities=False) - xml = objectify.fromstring(astr, parser) - removenamespaces(xml) + xml = objectify.fromstring(astr) + removenamespaces(xml) + # Throw XML parsing errors so we can blame the feed authors. return xml def parse(astr): @@ -57,10 +49,6 @@ def parse(astr): isrdf = False now = datetime.now(pytz.utc) - feede = xml.xpath(".//feed") - if len(feede) > 0: - xml = feede[0] - if hasattr(xml, "channel"): if hasattr(xml, "item"): isrdf = True @@ -71,11 +59,11 @@ def parse(astr): feed["title"] = "" for e in ("title", "description"): if hasattr(xml, e): - feed[e] = str(xml[e]) + feed[e] = html.unescape(str(xml[e])) if hasattr(xml, "image") and hasattr(xml.image, "title"): if "title" not in feed: - feed["title"] = str(xml.image.title) + feed["title"] = html.unescape(str(xml.image.title)) if hasattr(xml, "updated"): feed["updated"] = parseiso(xml.updated, now) @@ -93,25 +81,25 @@ def parse(astr): feed["link"] = str(xml.link) if hasattr(xml, "webmaster"): - feed["email"] = str(xml.webmaster) + feed["email"] = html.unescape(str(xml.webmaster)) elif hasattr(xml, "owner") and hasattr(xml.owner, "email"): - feed["email"] = str(xml.owner.email) + feed["email"] = html.unescape(str(xml.owner.email)) elif hasattr(xml, "author") and hasattr(xml.author, "email"): - feed["email"] = str(xml.author.email) + feed["email"] = html.unescape(str(xml.author.email)) elif hasattr(xml, "webMaster"): - feed["email"] = str(xml.webMaster) + feed["email"] = 
html.unescape(str(xml.webMaster)) elif hasattr(xml, "managingeditor"): - feed["email"] = str(xml.managingeditor) + feed["email"] = html.unescape(str(xml.managingeditor)) elif hasattr(xml, "managingEditor"): - feed["email"] = str(xml.managingEditor) + feed["email"] = html.unescape(str(xml.managingEditor)) if hasattr(xml, "author"): if hasattr(xml.author, "name"): - feed["author"] = str(xml.author.name) + feed["author"] = html.unescape(str(xml.author.name)) else: - feed["author"] = str(xml.author) + feed["author"] = html.unescape(str(xml.author)) elif hasattr(xml, "creator"): - feed["author"] = str(xml.creator) + feed["author"] = html.unescape(str(xml.creator)) entryname = "entry" if isrss == True or isrdf == True: @@ -123,7 +111,8 @@ def parse(astr): article = {} # title if hasattr(entry, "title"): - article["title"] = str(entry["title"]) + article["title"] = html.unescape(\ + str(entry["title"])) # link if hasattr(entry, "link"): @@ -149,8 +138,9 @@ def parse(astr): hasattr(entry.group, "content"): if "url" in entry.group.content: article["file"] = \ + html.unescape(\ str(entry.group.content.\ - attrib["file"]) + attrib["file"])) # updated try: @@ -171,19 +161,25 @@ def parse(astr): # author if hasattr(entry, "author"): if hasattr(entry.author, "name"): - article["author"] = str(entry.author.name) + article["author"] = html.unescape(\ + str(entry.author.name)) else: - article["author"] = str(entry.author) + article["author"] = html.unescape(\ + str(entry.author)) elif hasattr(entry, "creator"): - article["author"] = str(entry.creator) + article["author"] = html.unescape(\ + str(entry.creator)) # tags if hasattr(entry, "category"): article["tags"] = [] for cat in entry["category"][:]: - article["tags"].append(str(cat)) + article["tags"].append(\ + html.unescape(\ + str(cat))) # text + # Don't unescape the text, it might contain HTML. if hasattr(entry, "encoded"): article["text"] = str(entry.encoded) elif hasattr(entry, "content"):