zs

Zeitungsschau rss to email converter
git clone git://r-36.net/zs

feed.py (10801B)


#
# See LICENSE for licensing details.
#
# Copy me if you can.
# by 20h
#

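# feed.py: fetch RSS/Atom, JSON Feed and twtxt feeds and normalise them
# into plain dicts for the rest of Zeitungsschau.
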
from lxml import objectify
from datetime import datetime
import dateutil.parser
from dateutil.tz import gettz
import requests
import hashlib
import pytz
import codecs
import html
import urllib.parse
import socket
import json

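# Parse a date string with dateutil, falling back to the supplied `now`
# sentinel whenever the zone or the whole string cannot be parsed.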
def parseiso(dstr, now):
	# tzinfos callback for dateutil; `offset` is required by the
	# callback signature but not needed here.
	def gettzinfo(zone, offset):
		try:
			return gettz(zone)
		except Exception:
			return None

	try:
		return dateutil.parser.parse(str(dstr), default=now,
				tzinfos=gettzinfo)
	except Exception:
		# Invalid time format. Could not be parsed.
		return now

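# Strip every namespace prefix in place so the parsers below can address
# elements by their bare tag names.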
def removenamespaces(xml):
	for key in xml.nsmap:
		nsstr = u'{%s}' % (xml.nsmap[key])
		nsl = len(nsstr)

		for elem in xml.iter():
			# Comments and processing instructions have
			# non-string tags; skip them.
			if not isinstance(elem.tag, str):
				continue
			if elem.tag.startswith(nsstr):
				elem.tag = elem.tag[nsl:]

def parsexml(astr):
	xml = objectify.fromstring(astr)
	removenamespaces(xml)
	# Throw XML parsing errors so we can blame the feed authors.
	return xml

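# twtxt feeds are plain text: one status per line in the form
# "<RFC 3339 timestamp>\t<text>"; lines starting with "#" are comments.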
def parsetwtxtfeed(astr, uri):
	feed = {}
	articles = []
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	feed["title"] = uri
	feed["link"] = uri
	feed["updated"] = now

	lines = astr.split("\n")
	for line in lines:
		# People already reinterpret the standard. :(
		if len(line) == 0:
			continue
		if line[0] == "#":
			continue
		# Skip malformed lines that lack the mandatory tab.
		if "\t" not in line:
			continue

		createdtxt, ltext = line.split("\t", 1)
		created = parseiso(createdtxt, now)

		article = {}
		article["id"] = createdtxt
		article["title"] = ltext
		article["text"] = ltext
		article["updated"] = created

		if article["updated"] == now:
			article["uuid"] = ""
		else:
			article["uuid"] = "%s" % (article["updated"])

		articles.append(article)

	feed["articles"] = articles

	return feed

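# Map a JSON Feed (https://jsonfeed.org) document onto the same feed
# dict shape the XML parser produces.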
def parsejsonfeed(astr):
	js = json.loads(astr)

	feed = {}
	articles = []
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	if "title" in js:
		feed["title"] = js["title"]
	if "description" in js:
		feed["description"] = js["description"]
	if "home_page_url" in js:
		feed["link"] = js["home_page_url"]
	if "feed_url" in js:
		feed["link"] = js["feed_url"]
	if "author" in js:
		if "name" in js["author"]:
			feed["author"] = js["author"]["name"]
	feed["updated"] = now

	if "items" in js:
		for item in js["items"]:
			article = {}
			if "url" in item:
				article["file"] = item["url"]
			if "title" in item:
				article["title"] = item["title"]

			if "summary" in item:
				article["text"] = html.unescape(item["summary"])
			if "content_html" in item:
				article["text"] = html.unescape(item["content_html"])
			if "content_text" in item:
				article["text"] = html.unescape(item["content_text"])

			# Derive the id after the text fields are known so
			# the fallback cannot hit a missing "text" key.
			if "id" in item:
				article["id"] = item["id"]
			elif "file" in article:
				article["id"] = article["file"]
			elif "text" in article:
				article["id"] = article["text"][:30]

			if "date_published" in item:
				# Use the forgiving parser so one bad date
				# does not abort the whole feed.
				article["updated"] = \
					parseiso(item["date_published"], now)
			else:
				article["updated"] = now

			if article["updated"] == now:
				article["uuid"] = ""
			else:
				article["uuid"] = "%s" % (article["updated"])

			for e in ("id", "title", "file"):
				if e in article:
					article["uuid"] = "%s-%s" % \
						(article["uuid"],\
						 article[e])

			def mkuuid(s):
				return hashlib.sha256(str(s).\
					encode("utf8")).hexdigest()
			if len(article["uuid"]) == 0:
				article["uuid"] = mkuuid(now)
			else:
				article["uuid"] = mkuuid(article["uuid"])

			# sanity checks
			if "title" not in article and "text" not in article \
					and "file" not in article:
				continue

			articles.append(article)

	feed["articles"] = articles

	return feed

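# Parse Atom, RSS 2.0 (channel/item) and RDF/RSS 1.0 (items live next to
# the channel element) into the common feed dict.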
def parseatomfeed(astr):
	xml = parsexml(astr)
	if xml is None:
		return None

	feed = {}
	articles = []
	isrss = False
	isrdf = False
	oxml = None
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	if hasattr(xml, "channel"):
		if hasattr(xml, "item"):
			# RDF keeps the items outside the channel element.
			isrdf = True
			oxml = xml
		xml = xml.channel
		isrss = True

	feed["title"] = ""
	for e in ("title", "description"):
		if hasattr(xml, e):
			feed[e] = html.unescape(str(xml[e]))

	# Fall back to the image title; feed["title"] is preset to "",
	# so test for emptiness rather than for a missing key.
	if hasattr(xml, "image") and hasattr(xml.image, "title"):
		if len(feed["title"]) == 0:
			feed["title"] = html.unescape(str(xml.image.title))

	if hasattr(xml, "updated"):
		feed["updated"] = parseiso(xml.updated, now)
	elif hasattr(xml, "pubDate"):
		feed["updated"] = parseiso(xml.pubDate, now)
	elif hasattr(xml, "lastBuildDate"):
		feed["updated"] = parseiso(xml.lastBuildDate, now)
	else:
		feed["updated"] = now

	if hasattr(xml, "link"):
		if "href" in xml.link.attrib:
			feed["link"] = str(xml.link.attrib["href"])
		else:
			feed["link"] = str(xml.link)

	if hasattr(xml, "webmaster"):
		feed["email"] = html.unescape(str(xml.webmaster))
	elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
		feed["email"] = html.unescape(str(xml.owner.email))
	elif hasattr(xml, "author") and hasattr(xml.author, "email"):
		feed["email"] = html.unescape(str(xml.author.email))
	elif hasattr(xml, "webMaster"):
		feed["email"] = html.unescape(str(xml.webMaster))
	elif hasattr(xml, "managingeditor"):
		feed["email"] = html.unescape(str(xml.managingeditor))
	elif hasattr(xml, "managingEditor"):
		feed["email"] = html.unescape(str(xml.managingEditor))

	if hasattr(xml, "author"):
		if hasattr(xml.author, "name"):
			feed["author"] = html.unescape(str(xml.author.name))
		else:
			feed["author"] = html.unescape(str(xml.author))
	elif hasattr(xml, "creator"):
		feed["author"] = html.unescape(str(xml.creator))

	entryname = "entry"
	if isrss or isrdf:
		entryname = "item"
	if isrdf:
		xml = oxml
	if hasattr(xml, entryname):
		for entry in xml[entryname][:]:
			article = {}
			# title
			if hasattr(entry, "title"):
				article["title"] = html.unescape(\
						str(entry["title"]))

			# link
			if hasattr(entry, "link"):
				if "href" in entry.link.attrib:
					article["link"] = str(entry.link.attrib["href"])
				else:
					article["link"] = str(entry.link)
			elif hasattr(entry, "source"):
				article["link"] = str(entry.source)

			# enclosure
			if hasattr(entry, "enclosure"):
				if "href" in entry.enclosure.attrib:
					article["file"] = \
						str(entry.enclosure.attrib["href"])
				elif "url" in entry.enclosure.attrib:
					article["file"] = \
						str(entry.enclosure.attrib["url"])
				else:
					article["file"] = str(entry.enclosure)

			# Media RSS: media:group/media:content carries the
			# payload in its "url" attribute.
			if hasattr(entry, "group") and \
					hasattr(entry.group, "content"):
				if "url" in entry.group.content.attrib:
					article["file"] = \
						html.unescape(\
						str(entry.group.content.\
						attrib["url"]))

			# updated
			try:
				if hasattr(entry, "updated"):
					article["updated"] = parseiso(entry.updated,\
							now)
				elif hasattr(entry, "temporary"):
					article["updated"] = now
				elif hasattr(entry, "pubDate"):
					article["updated"] = parseiso(entry.pubDate,\
							now)
				elif hasattr(entry, "date"):
					article["updated"] = parseiso(entry.date, now)
				else:
					article["updated"] = now
			except TypeError:
				# There was some error in parseiso.
				article["updated"] = now

			# author
			if hasattr(entry, "author"):
				if hasattr(entry.author, "name"):
					article["author"] = html.unescape(\
							str(entry.author.name))
				else:
					article["author"] = html.unescape(\
							str(entry.author))
			elif hasattr(entry, "creator"):
				article["author"] = html.unescape(\
						str(entry.creator))

			# tags
			if hasattr(entry, "category"):
				article["tags"] = []
				for cat in entry["category"][:]:
					article["tags"].append(\
							html.unescape(\
							str(cat)))

			# text
			# Don't unescape the text, it might contain HTML.
			if hasattr(entry, "encoded"):
				article["text"] = str(entry.encoded)
			elif hasattr(entry, "content"):
				article["text"] = str(entry.content)
			elif hasattr(entry, "summary"):
				article["text"] = str(entry.summary)
			elif hasattr(entry, "description"):
				article["text"] = str(entry.description)

			# id
			if hasattr(entry, "id"):
				article["id"] = str(entry["id"])
			elif "link" in article:
				article["id"] = article["link"]
			elif "file" in article:
				article["id"] = article["file"]
			elif "text" in article:
				# Fall back to a text prefix; guard the key
				# so a bare entry cannot raise a KeyError.
				article["id"] = article["text"][:30]

			if article["updated"] == now:
				article["uuid"] = ""
			else:
				article["uuid"] = "%s" % (article["updated"])

			# Certain websites need exceptions due to their
			# »programmers« being stupid.
			if "link" in feed:
				if "youtube.com" in feed["link"]:
					article["uuid"] = ""

			for e in ("id", "title", "file"):
				if e in article:
					article["uuid"] = "%s-%s" % \
						(article["uuid"],\
						 article[e])

			def mkuuid(s):
				return hashlib.sha256(str(s).\
					encode("utf8")).hexdigest()
			if len(article["uuid"]) == 0:
				article["uuid"] = mkuuid(now)
			else:
				article["uuid"] = mkuuid(article["uuid"])

			# sanity checks
			if "title" not in article and "text" not in article \
					and "file" not in article:
				continue

			articles.append(article)

	try:
		feed["articles"] = sorted(articles, key=lambda article: \
				article["updated"])
	except TypeError:
		# Mixed naive and aware datetimes cannot be compared; keep
		# the unsorted list instead of returning a broken feed.
		feed["articles"] = articles

	return feed

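# Fetch a feed over file://, gopher:// or HTTP(S) and dispatch to the
# right parser; returns a (status code, feed dict) tuple.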
def fetch(uri):
	ftype = "xml"
	if uri.startswith("file://"):
		fd = codecs.open(uri[7:], "r", "utf-8")
		fval = fd.read().encode("utf-8")
		fd.close()
		rcode = 200
	elif uri.startswith("gopher://"):
		urls = urllib.parse.urlparse(uri, allow_fragments=False)
		if ":" in urls.netloc:
			(host, port) = urls.netloc.split(":")
			port = int(port)
		else:
			host = urls.netloc
			port = 70
		# Gopher paths look like "/<type><selector>"; drop the
		# leading slash and the item type character.
		if len(urls.path) > 2:
			if len(urls.query) > 0:
				selector = "%s?%s" % (urls.path[2:], urls.query)
			else:
				selector = urls.path[2:]
		else:
			selector = ""

		s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
		s.connect((host, port))
		s.send(("%s\r\n" % (selector)).encode("utf-8"))
		s.shutdown(socket.SHUT_WR)
		fd = s.makefile("r")
		fval = fd.read().encode("utf-8")
		s.close()
		rcode = 200
	else:
		fd = requests.get(uri, timeout=20,\
			headers={"User-Agent": "Zeitungsschau/1.0"})
		fval = fd.content
		rcode = fd.status_code

		if "Content-Type" in fd.headers:
			if "application/json" in fd.headers["Content-Type"]:
				ftype = "json"

	if ftype == "xml":
		# No Content-Type hint; sniff the type from the URI suffix.
		suri = uri.lower().rsplit(".", 1)
		if len(suri) > 1:
			if suri[-1] == "json":
				ftype = "json"
			elif suri[-1] == "txt":
				ftype = "twtxt"

	if ftype == "xml":
		rval = (rcode, parseatomfeed(fval))
	elif ftype == "twtxt":
		rval = (rcode, parsetwtxtfeed(fval.decode("utf-8"), uri))
	else:
		rval = (rcode, parsejsonfeed(fval.decode("utf-8")))

	if rval[1] is not None:
		rval[1]["feeduri"] = uri

	return rval
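
# Usage sketch (assumed driver code, not part of this file):
#
#	rcode, f = fetch("https://example.com/index.xml")
#	if rcode == 200 and f is not None:
#		for article in f["articles"]:
#			print(article["updated"], article.get("title", ""))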