zs

Zeitungsschau RSS to email converter
git clone git://r-36.net/zs
Log | Files | Refs | README | LICENSE

feed.py (10854B)


#
# See LICENSE for licensing details.
#
# Copy me if you can.
# by 20h
#

import lxml
import lxml.objectify
import html
from datetime import datetime
import dateutil.parser
from dateutil.tz import gettz
import requests
import hashlib
import pytz
import codecs
import urllib.parse
import socket
import json

def parseiso(dstr, now):
	def gettzinfo(zone, offset):
		# Resolve time zone names dateutil cannot map on its own.
		try:
			return gettz(zone)
		except Exception:
			return None

	try:
		return dateutil.parser.parse(str(dstr), default=now,
				tzinfos=gettzinfo)
	except Exception:
		# Invalid time format. Could not be parsed.
		return now

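# Illustration of the fallback contract: any string dateutil cannot parse
# comes back as the caller-supplied `now`, so comparing a timestamp
# against `now` later means "no usable timestamp". (Values are examples.)
#
#	now = datetime.now(pytz.utc)
#	parseiso("2017-02-15T12:00:00Z", now)	# timezone-aware datetime
#	parseiso("no date here", now) == now	# True
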
def removenamespaces(xml):
	for key in xml.nsmap:
		nsstr = u'{%s}' % (xml.nsmap[key])
		nsl = len(nsstr)

		for elem in xml.iter():
			if elem.tag.startswith(nsstr):
				elem.tag = elem.tag[nsl:]

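# For example, an Atom document declaring xmlns="http://www.w3.org/2005/Atom"
# carries element tags like "{http://www.w3.org/2005/Atom}title"; after
# removenamespaces() they are plain "title", which keeps the objectify
# attribute lookups below simple.
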
def parsexml(astr):
	xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8"))
	removenamespaces(xml)
	# Throw XML parsing errors so we can blame the feed authors.
	#print(lxml.objectify.dump(xml))
	return xml

def parsetwtxtfeed(astr, uri):
	feed = {}
	articles = []
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	feed["title"] = uri
	feed["link"] = uri
	feed["updated"] = now

	lines = astr.split("\n")
	for line in lines:
		# People already reinterpret the standard. :(
		if len(line) == 0:
			continue
		if line[0] == "#":
			continue
		if "\t" not in line:
			# Malformed line without a timestamp field.
			continue

		createdtxt, ltext = line.split("\t", 1)
		created = parseiso(createdtxt, now)

		article = {}
		article["id"] = createdtxt
		article["title"] = ltext
		article["text"] = ltext
		article["updated"] = created

		if article["updated"] == now:
			article["uuid"] = ""
		else:
			article["uuid"] = "%s" % (article["updated"])

		articles.append(article)

	feed["articles"] = articles

	return feed

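# Example twtxt input: one status per line, an ISO 8601 timestamp,
# a tab, then the text. Comment lines start with "#".
#
#	2017-02-15T12:00:00+01:00	Good morning, gopherspace!
#
# Each such line becomes one article; the timestamp doubles as its id.
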
def parsejsonfeed(astr):
	js = json.loads(astr)

	feed = {}
	articles = []
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	if "title" in js:
		feed["title"] = js["title"]
	if "description" in js:
		feed["description"] = js["description"]
	if "home_page_url" in js:
		feed["link"] = js["home_page_url"]
	if "feed_url" in js:
		feed["link"] = js["feed_url"]
	if "author" in js:
		if "name" in js["author"]:
			feed["author"] = js["author"]["name"]
	feed["updated"] = now

	if "items" in js:
		for item in js["items"]:
			article = {}
			if "url" in item:
				article["file"] = item["url"]
			if "title" in item:
				article["title"] = item["title"]

			if "summary" in item:
				article["text"] = html.unescape(item["summary"])
			if "content_html" in item:
				article["text"] = html.unescape(item["content_html"])
			if "content_text" in item:
				article["text"] = html.unescape(item["content_text"])

			# Pick the id after "text" is known, so the last
			# fallback cannot hit a missing key.
			if "id" in item:
				article["id"] = item["id"]
			elif "link" in article:
				article["id"] = article["link"]
			elif "file" in article:
				article["id"] = article["file"]
			elif "text" in article:
				article["id"] = article["text"][:30]

			if "date_published" in item:
				article["updated"] = \
					dateutil.parser.parse(item["date_published"])
			else:
				article["updated"] = now

			if article["updated"] == now:
				article["uuid"] = ""
			else:
				article["uuid"] = "%s" % (article["updated"])

			for e in ("id", "title", "file"):
				if e in article:
					article["uuid"] = "%s-%s" % \
						(article["uuid"], article[e])

			def mkuuid(s):
				return hashlib.sha256(str(s).\
					encode("utf8")).hexdigest()
			if len(article["uuid"]) == 0:
				article["uuid"] = mkuuid(now)
			else:
				article["uuid"] = mkuuid(article["uuid"])

			# sanity checks
			if "title" not in article and "text" not in article \
					and "file" not in article:
				continue

			articles.append(article)

	feed["articles"] = articles

	return feed

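# A minimal JSON Feed document covering the fields read above
# (URL and values are placeholders):
#
#	{
#	  "title": "Example feed",
#	  "home_page_url": "https://example.org/",
#	  "items": [{
#	    "id": "1",
#	    "url": "https://example.org/1",
#	    "content_text": "Hello.",
#	    "date_published": "2017-02-15T12:00:00Z"
#	  }]
#	}
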
def parseatomfeed(astr):
	xml = parsexml(astr)
	if xml is None:
		return None

	feed = {}
	articles = []
	isrss = False
	isrdf = False
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	# RSS wraps everything in a channel element; RDF feeds keep the
	# items next to it.
	if hasattr(xml, "channel"):
		if hasattr(xml, "item"):
			isrdf = True
			oxml = xml
		xml = xml.channel
		isrss = True

	feed["title"] = ""
	for e in ("title", "description"):
		if hasattr(xml, e):
			feed[e] = html.unescape(str(xml[e]))

	if hasattr(xml, "image") and hasattr(xml.image, "title"):
		if len(feed["title"]) == 0:
			feed["title"] = html.unescape(str(xml.image.title))

	if hasattr(xml, "updated"):
		feed["updated"] = parseiso(xml.updated, now)
	elif hasattr(xml, "pubDate"):
		feed["updated"] = parseiso(xml.pubDate, now)
	elif hasattr(xml, "lastBuildDate"):
		feed["updated"] = parseiso(xml.lastBuildDate, now)
	else:
		feed["updated"] = now

	if hasattr(xml, "link"):
		if "href" in xml.link.attrib:
			feed["link"] = str(xml.link.attrib["href"])
		else:
			feed["link"] = str(xml.link)

	if hasattr(xml, "webmaster"):
		feed["email"] = html.unescape(str(xml.webmaster))
	elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
		feed["email"] = html.unescape(str(xml.owner.email))
	elif hasattr(xml, "author") and hasattr(xml.author, "email"):
		feed["email"] = html.unescape(str(xml.author.email))
	elif hasattr(xml, "webMaster"):
		feed["email"] = html.unescape(str(xml.webMaster))
	elif hasattr(xml, "managingeditor"):
		feed["email"] = html.unescape(str(xml.managingeditor))
	elif hasattr(xml, "managingEditor"):
		feed["email"] = html.unescape(str(xml.managingEditor))

	if hasattr(xml, "author"):
		if hasattr(xml.author, "name"):
			feed["author"] = html.unescape(str(xml.author.name))
		else:
			feed["author"] = html.unescape(str(xml.author))
	elif hasattr(xml, "creator"):
		feed["author"] = html.unescape(str(xml.creator))

	entryname = "entry"
	if isrss or isrdf:
		entryname = "item"
	if isrdf:
		xml = oxml
	if hasattr(xml, entryname):
		for entry in xml[entryname][:]:
			article = {}
			# title
			if hasattr(entry, "title"):
				article["title"] = html.unescape(\
						str(entry["title"]))

			# link
			if hasattr(entry, "link"):
				if "href" in entry.link.attrib:
					article["link"] = str(entry.link.attrib["href"])
				else:
					article["link"] = str(entry.link)
			elif hasattr(entry, "source"):
				article["link"] = str(entry.source)

			# enclosure
			if hasattr(entry, "enclosure"):
				if "href" in entry.enclosure.attrib:
					article["file"] = \
						str(entry.enclosure.attrib["href"])
				elif "url" in entry.enclosure.attrib:
					article["file"] = \
						str(entry.enclosure.attrib["url"])
				else:
					article["file"] = str(entry.enclosure)

			# media:group content (e.g. youtube)
			if hasattr(entry, "group") and \
					hasattr(entry.group, "content"):
				if "url" in entry.group.content.attrib:
					article["file"] = \
						html.unescape(\
						str(entry.group.content.\
						attrib["url"]))

			# updated
			try:
				if hasattr(entry, "updated"):
					article["updated"] = parseiso(entry.updated,\
							now)
				elif hasattr(entry, "temporary"):
					article["updated"] = now
				elif hasattr(entry, "pubDate"):
					article["updated"] = parseiso(entry.pubDate,\
							now)
				elif hasattr(entry, "date"):
					article["updated"] = parseiso(entry.date, now)
				else:
					article["updated"] = now
			except TypeError:
				# There was some error in parseiso.
				article["updated"] = now

			# author
			if hasattr(entry, "author"):
				if hasattr(entry.author, "name"):
					article["author"] = html.unescape(\
							str(entry.author.name))
				else:
					article["author"] = html.unescape(\
							str(entry.author))
			elif hasattr(entry, "creator"):
				article["author"] = html.unescape(\
						str(entry.creator))

			# tags
			if hasattr(entry, "category"):
				article["tags"] = []
				for cat in entry["category"][:]:
					article["tags"].append(\
							html.unescape(\
							str(cat)))

			# text
			# Don't unescape the text, it might contain HTML.
			if hasattr(entry, "encoded"):
				article["text"] = str(entry.encoded)
			elif hasattr(entry, "content"):
				article["text"] = str(entry.content)
			elif hasattr(entry, "summary"):
				article["text"] = str(entry.summary)
			elif hasattr(entry, "description"):
				article["text"] = str(entry.description)

			# id
			if hasattr(entry, "id"):
				article["id"] = str(entry["id"])
			elif "link" in article:
				article["id"] = article["link"]
			elif "file" in article:
				article["id"] = article["file"]
			elif "text" in article:
				article["id"] = article["text"][:30]

			if article["updated"] == now:
				article["uuid"] = ""
			else:
				article["uuid"] = "%s" % (article["updated"])

			# Certain websites need exceptions due to their
			# »programmers« being stupid.
			if "link" in feed:
				if "youtube.com" in feed["link"]:
					article["uuid"] = ""

			for e in ("id", "title", "file"):
				if e in article:
					article["uuid"] = "%s-%s" % \
						(article["uuid"], article[e])

			def mkuuid(s):
				return hashlib.sha256(str(s).\
					encode("utf8")).hexdigest()
			if len(article["uuid"]) == 0:
				article["uuid"] = mkuuid(now)
			else:
				article["uuid"] = mkuuid(article["uuid"])

			# sanity checks
			if "title" not in article and "text" not in article \
					and "file" not in article:
				continue

			articles.append(article)

	try:
		feed["articles"] = sorted(articles, key=lambda article: \
				article["updated"])
	except TypeError:
		# Mixing naive and aware datetimes makes them unorderable;
		# keep the feed usable by falling back to document order.
		feed["articles"] = articles

	return feed

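# Sketch of an RSS 2.0 input reduced to the elements handled above
# (values are placeholders):
#
#	<rss version="2.0"><channel>
#	  <title>Example feed</title>
#	  <link>https://example.org/</link>
#	  <item>
#	    <title>Hello</title>
#	    <link>https://example.org/1</link>
#	    <pubDate>Wed, 15 Feb 2017 12:00:00 +0100</pubDate>
#	    <description>Hello.</description>
#	  </item>
#	</channel></rss>
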
def fetch(uri):
	ftype = "xml"
	if uri.startswith("file://"):
		fd = codecs.open(uri[7:], "r", "utf-8")
		fval = fd.read().encode("utf-8")
		fd.close()
		rcode = 200
	elif uri.startswith("gopher://"):
		urls = urllib.parse.urlparse(uri, allow_fragments=False)
		if ":" in urls.netloc:
			(host, port) = urls.netloc.split(":")
			port = int(port)
		else:
			host = urls.netloc
			port = 70
		# Strip the leading "/" and the gopher item type from the
		# path to get the selector.
		if len(urls.path) > 2:
			if len(urls.query) > 0:
				selector = "%s?%s" % (urls.path[2:], urls.query)
			else:
				selector = urls.path[2:]
		else:
			selector = ""

		s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
		s.connect((host, port))
		s.send(("%s\r\n" % (selector)).encode("utf-8"))
		fd = s.makefile("r")
		fval = fd.read().encode("utf-8")
		s.close()
		rcode = 200
	else:
		fd = requests.get(uri, timeout=20,
			headers={"User-Agent": "Zeitungsschau/1.0"})
		fval = fd.content
		rcode = fd.status_code

		if "Content-Type" in fd.headers:
			if "application/json" in fd.headers["Content-Type"]:
				ftype = "json"

	# Fall back to the file extension when the transport gave no
	# usable content type.
	if ftype == "xml":
		suri = uri.lower().rsplit(".", 1)
		if len(suri) > 1:
			if suri[-1] == "json":
				ftype = "json"
			elif suri[-1] == "txt":
				ftype = "twtxt"

	if ftype == "xml":
		rval = (rcode, parseatomfeed(fval))
	elif ftype == "twtxt":
		rval = (rcode, parsetwtxtfeed(fval.decode("utf-8"), uri))
	else:
		rval = (rcode, parsejsonfeed(fval.decode("utf-8")))

	if rval[1] is not None:
		rval[1]["feeduri"] = uri

	return rval
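
# A minimal usage sketch (the feed URI comes from the command line;
# assumes the fetch succeeds and the feed parses):
if __name__ == "__main__":
	import sys

	status, feed = fetch(sys.argv[1])
	print("%d %s" % (status, feed.get("title", "")))
	for article in feed["articles"]:
		print("%s  %s" % (article["updated"],
			article.get("title", "")))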