eomyidae

A crawler for Gopher servers
git clone git://r-36.net/eomyidae
Log | Files | Refs | README | LICENSE

eomyidae (15243B)


      1 #!/usr/bin/env python
      2 # coding=utf-8
      3 #
      4 # See the LICENSE file for details.
      5 #
      6 
      7 import os
      8 import sys
      9 import getopt
     10 import urllib.parse
     11 import socket
     12 import io
     13 import pickle
     14 import time
     15 import hashlib
     16 import errno
     17 import random
     18 import operator
     19 import math
     20 from multiprocessing import Pool
     21 from datetime import datetime
     22 from datetime import timedelta
     23 
     24 def parseuri(uri):
     25 	urls = urllib.parse.urlparse(uri, allow_fragments=False)
     26 	if ":" in urls.netloc:
     27 		(host, port) = urls.netloc.split(":")[:2]
     28 	else:
     29 		host = urls.netloc
     30 		port = 70
     31 
     32 	mtype = "1"
     33 	if len(urls.path) > 1:
     34 		mtype = urls.path[1]
     35 
     36 	if len(urls.path) > 2:
     37 		if len(urls.query) > 0:
     38 			selector = "%s?%s" % (urls.path[2:], urls.query)
     39 		else:
     40 			selector = urls.path[2:]
     41 	else:
     42 		selector = ""
     43 
     44 	return (host, port, mtype, selector) 
     45 
     46 def poolgopher(req):
     47 	data = gopher(req[0], req[1], req[2], req[3])
     48 	req.append(data)
     49 	return req 
     50 
     51 def gopher(uri=None, host=None, port=70, selector=""):
     52 	#print("gopher(uri = %s, host = %s, port = %d, selector = %s)" % \
     53 	#		(uri, host, port, selector))
     54 	if uri != None:
     55 		(host, port, mtype, selector) = parseuri(uri)
     56 		port = int(port)
     57 
     58 	s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     59 	s.settimeout(20)
     60 	try:
     61 		s.connect((host, port))
     62 	except socket.gaierror:
     63 		return ""
     64 	except socket.timeout:
     65 		return ""
     66 	except TimeoutError:
     67 		return ""
     68 	except ConnectionResetError:
     69 		return ""
     70 	except OverflowError:
     71 		return ""
     72 	except OSError as e:
     73 		# No route to host.
     74 		if e.errno == 113:
     75 			return ""
     76 
     77 	try:
     78 		s.send(("%s\r\n" % (selector)).encode("utf-8"))
     79 	except BrokenPipeError:
     80 		return ""
     81 
     82 	fd = s.makefile("b")
     83 	try:
     84 		data = fd.read()
     85 	except socket.timeout:
     86 		fd.close()
     87 		return ""
     88 	except ConnectionResetError:
     89 		fd.close()
     90 		return ""
     91 	fd.close()
     92 
     93 	try:
     94 		content = data.decode(errors='replace')
     95 	except UnicodeDecodeError:
     96 		content = data.decode("iso-8859-1")
     97 
     98 	return content
     99 
    100 def parsemenu(data):
    101 	menu = []
    102 	lines = data.split("\n")
    103 	for line in lines:
    104 		line = line.strip()
    105 		if len(line) < 1:
    106 			continue
    107 
    108 		mtype = line[0]
    109 
    110 		# Last entry
    111 		if mtype == ".":
    112 			break
    113 
    114 		elements = line[1:].split("\t")
    115 		if len(elements) < 4:
    116 			continue
    117 		(description, selector, host, port) = elements[:4]
    118 		menu.append([mtype, description, selector, host, port])
    119 
    120 	return menu
    121 
    122 def menu2text(menu):
    123 	text = ""
    124 	for entry in menu:
    125 		if type(entry[1]) != str:
    126 				continue
    127 
    128 		text += "%s\n" % (entry[1])
    129 	
    130 	return text
    131 
    132 ## Robots.txt
    133 # https://en.wikipedia.org/wiki/Robots.txt
    134 # # Comment
    135 # User-agent: somebot
    136 # Disallow: /path
    137 # Allow: /path
    138 # Crawl-delay: seconds
    139 def parserobots(data):
    140 	robots = []
    141 	lines = data.split("\n")
    142 	for line in lines:
    143 		line = line.strip()
    144 		if "#" in line:
    145 			(line, comment) = line.split("#", 1)
    146 		if len(line) < 0:
    147 			# Empty line, needed for bot-specific rules.
    148 			robots.append(["",""])
    149 			continue
    150 		if not ":" in line:
    151 			continue
    152 
    153 		(header, value) = line.strip().split(":", 1)
    154 		value = value.strip().lower()
    155 		header = header.strip().lower()
    156 		robots.append([header, value])
    157 	return robots
    158 
    159 def adaptrobots(robotsdata):
    160 	filterlines = {}
    161 	robotslines = parserobots(robotsdata)
    162 	i = 0
    163 
    164 	allowlines = []
    165 	disallowlines = []
    166 	otherlines = []
    167 	iseomyidae = False
    168 	while i < len(robotslines):
    169 		header = robotslines[i][0].lower()
    170 		value = robotslines[i][1]
    171 		if header == "user-agent":
    172 			ua = value.split("/")
    173 			if ua[0] == "eomyidae" or ua[0] == "*":
    174 				iseomyidae = 1
    175 			else:
    176 				iseomyidae = 0
    177 		elif header == "allow" and iseomyidae == True:
    178 			allowlines.append(value)
    179 		elif header == "disallow" and iseomyidae == True:
    180 			disallowlines.append(value)
    181 		elif header == "":
    182 			iseomyidae = False
    183 		else:
    184 			if iseomyidae == True:
    185 				otherlines.append([header, value])
    186 		i += 1
    187 
    188 	filterlines["allow"] = allowlines
    189 	filterlines["disallow"] = disallowlines
    190 	filterlines["other"] = otherlines
    191 	if len(allowlines) > 0 or len(disallowlines) > 0 \
    192 			or len(otherlines) > 0:
    193 		filterlines["empty"] = False
    194 	else:
    195 		filterlines["empty"] = True
    196 	
    197 	return filterlines
    198 
    199 def mkpath(cachepath):
    200 	try:
    201 		os.makedirs(cachepath)
    202 	except OSError as e:
    203 		if e.errno != errno.EEXIST:
    204 			raise
    205 
    206 def mkopen(cachefile):
    207 	if not os.path.exists(cachefile):
    208 		fd = open(cachefile, "xb")
    209 	else:
    210 		fd = open(cachefile, "wb")
    211 	return fd
    212 
    213 def informserveradmin(uri, host=None, port=70):
    214 	if host == None:
    215 		(host, port, mtype, selector) = parseuri(uri)
    216 		port = int(port)
    217 
    218 	# We are nice and inform before every robots.txt, how to contact us.
    219 	gopher(host=host, port=port, selector="This is eomyidae, your "
    220 			"friendly crawler. See "
    221 			"gopher://gopherproject.org/1/eomyidae for "
    222 			"more info. Have a nice day!")
    223 
    224 def cacherobots(cachedir, uri, host=None, port=70, force=False, \
    225 		filtercache=None):
    226 	if host == None:
    227 		(host, port, mtype, selector) = parseuri(uri)
    228 		port = int(port)
    229 
    230 	if filtercache != None and host in filtercache:
    231 		#print("Got filterlines from memory filtercache.")
    232 		return filtercache[host]
    233 
    234 	print("Getting robots for %s:%d" % (host, port))
    235 
    236 	cachepath = "%s/%s:%d" % (cachedir, host, port)
    237 	mkpath(cachepath)
    238 
    239 	cacherobotstxt = "%s/robots.txt" % (cachepath)
    240 	cacherobotspickle = "%s/robots.pickle" % (cachepath)
    241 	filterlines = {}
    242 	if not os.path.exists(cacherobotstxt) or force == True:
    243 		# Be nice.
    244 		informserveradmin(uri=uri, host=host, port=port)
    245 
    246 		robotsdata = gopher(host=host, port=port, selector="/robots.txt")
    247 		print("Got new robots.txt.")
    248 		print(robotsdata)
    249 		robotstxtfd = mkopen(cacherobotstxt)
    250 		robotstxtfd.write(robotsdata.encode())
    251 		robotstxtfd.close()
    252 
    253 		filterlines = adaptrobots(robotsdata)
    254 		# Do not store if there is nothing, so we save I/O later.
    255 		if filterlines["empty"] == False:
    256 			print("Storing filterlines.")
    257 			storelistdb(cacherobotspickle, filterlines)
    258 
    259 	else:
    260 		if os.path.exists(cacherobotspickle):
    261 			#print("Loading filterlines from cache.")
    262 			filterlines = loadlistdb(cacherobotspickle)
    263 		else:
    264 			#print("No filterlines available in cache.")
    265 			filterlines["empty"] = True
    266 
    267 	#print(filterlines)
    268 	if filtercache != None:
    269 		filtercache[host] = filterlines
    270 
    271 	return filterlines
    272 
    273 def selectorisallowed(filterlines, selector):
    274 	if filterlines["empty"] == True:
    275 		return True
    276 
    277 	def robotsmatch(pattern, selector):
    278 		#print("pattern = %s, selector = %s" % (pattern, selector))
    279 		if pattern == '*':
    280 			#print("Just start match.")
    281 			return True
    282 		elif pattern[0] == '*':
    283 			#print("Begins with star.")
    284 			if pattern[-1] == '*':
    285 				#print("Begins and ends with star.")
    286 				if pattern[1:-1] in selector:
    287 					#print("Matches.")
    288 					return True
    289 				else:
    290 					return False
    291 			else:
    292 				return selector.endswith(pattern[1:])
    293 		elif pattern[-1] == '*':
    294 			#print("Ends with star.")
    295 			return selector.startswith(pattern[:-1])
    296 		else:
    297 			return selector.startswith(pattern)
    298 
    299 	isallowed = True
    300 	for line in filterlines["disallow"]:
    301 		# TODO: Should this be match everything?
    302 		if len(line) == 0:
    303 			continue
    304 		if robotsmatch(line, selector) == True:
    305 			#print("isallowed = False")
    306 			isallowed = False
    307 	for line in filterlines["allow"]:
    308 		# TODO: Should this be match everything?
    309 		if len(line) == 0:
    310 			continue
    311 		if robotsmatch(line, selector) == True:
    312 			#print("isallowed = True")
    313 			isallowed = True
    314 
    315 	#print("isallowed = %d" % (isallowed))
    316 	return isallowed
    317 
    318 def loadselectorstxt(filename):
    319 	selectors = []
    320 
    321 	if os.path.exists(filename):
    322 		fd = open(filename, "r")
    323 		for line in fd:
    324 			fields = line.split("|")
    325 			selectors.append(fields)
    326 		fd.close()
    327 	
    328 	return selectors
    329 
    330 def loadlist(filename):
    331 	listelems = []
    332 
    333 	if os.path.exists(filename):
    334 		fd = open(filename, "r")
    335 		for line in fd:
    336 			line = line.strip()
    337 			if len(line) == 0:
    338 				continue
    339 			if line[0] == "#":
    340 				continue
    341 			listelems.append(line)
    342 		fd.close()
    343 	
    344 	return listelems
    345 
    346 def loadlistdb(filename):
    347 	listelems = []
    348 
    349 	if os.path.exists(filename):
    350 		fd = open(filename, "rb")
    351 		try:
    352 			listelems = pickle.load(fd)
    353 		except EOFError:
    354 			return []
    355 		fd.close()
    356 	
    357 	return listelems
    358 
    359 def storelistdb(filename, listelems):
    360 	fd = mkopen(filename)
    361 	pickle.dump(listelems, fd)
    362 	fd.close()
    363 
    364 def storerawdata(cachedir, uri, data, host=None, port=70):
    365 	if host == None:
    366 		(host, port, mtype, selector) = parseuri(uri)
    367 		port = int(port)
    368 
    369 	cachepath = "%s/%s:%s" % (cachedir, host, port)
    370 	mkpath(cachepath)
    371 
    372 	m = hashlib.sha256()
    373 	m.update(uri.encode())
    374 	urihash = m.hexdigest()
    375 
    376 	cachepath = "%s/%s.menu" % (cachepath, urihash)
    377 	fd = mkopen(cachepath)
    378 	#print("Storing %s at %s" % (uri, cachepath))
    379 	fd.write(("%s\n" % (uri)).encode())
    380 	fd.write(data.encode())
    381 	fd.close()
    382 
    383 def usage(app):
    384 	app = os.path.basename(app)
    385 	print("usage: %s [-hor] [-b base] [-f blocklist] [-w n] [starturl]" % (app), file=sys.stderr)
    386 	sys.exit(1)
    387 
    388 def main(args):
    389 	try:
    390 		opts, largs = getopt.getopt(args[1:], "hb:f:ow:r")
    391 	except getopt.GetoptError as err:
    392 		print(str(err))
    393 		usage(args[0])
    394 
    395 	blocklistfile = None
    396 	blocklist = []
    397 
    398 	base = "."
    399 	starturi = None
    400 	workernum = 1
    401 	robotscache = {}
    402 	forcehostscount = False
    403 	for o, a in opts:
    404 		if o == "-h":
    405 			usage(args[0])
    406 		elif o == "-b":
    407 			base = a
    408 		elif o == "-f":
    409 			blocklistfile = a
    410 			blocklist = loadlist(blocklistfile)
    411 			print("blocklist: %s" % (blocklist))
    412 		elif o == "-o":
    413 			forcehostscount = True
    414 		elif o == "-r":
    415 			# Do not cache robots.txt in memory.
    416 			robotscache = None
    417 		elif o == "-w":
    418 			try:
    419 				workernum = int(a)
    420 			except ValueError:
    421 				workernum = 1
    422 		else:
    423 			assert False, "unhandled option"
    424 
    425 	os.chdir(base)
    426 	cachedir = "%s/cache" % (base)
    427 
    428 	if len(largs) > 0:
    429 		starturi = largs[0]
    430 
    431 	knownuris = loadlistdb("knownuris.pickle")
    432 	if knownuris == []:
    433 		knownuris = {}
    434 	lastlenknownuris = len(knownuris)
    435 
    436 	def isblocked(uri):
    437 		for rule in blocklist:
    438 			if uri.startswith(rule):
    439 				return True
    440 		return False
    441 
    442 	def addhostscount(host):
    443 		if host in hostscount:
    444 			hostscount[host] += 1
    445 		else:
    446 			hostscount[host] = 1
    447 
    448 	def subhostscount(host):
    449 		if host in hostscount:
    450 			hostscount[host] -= 1
    451 			if hostscount[host] <= 0:
    452 				del hostscount[host]
    453 
    454 	def addhostscache(uri, host=None, port=70, selector="/"):
    455 		if uri != None and host == None:
    456 			(host, port, mtype, selector) = parseuri(uri)
    457 			port = int(port)
    458 		else:
    459 			try:
    460 				port = int(port)
    461 			except ValueError:
    462 				return
    463 
    464 		if uri in knownuris:
    465 			print("ignored for queue: %s" % (uri))
    466 			return
    467 		if host == "":
    468 			print("ignored for queue: %s" % (uri))
    469 			return
    470 		if isblocked(uri):
    471 			print("blocked by filters: %s" % (uri))
    472 			return
    473 
    474 		addhostscount(host)
    475 
    476 		if not host in hostscache:
    477 			hostscache[host] = {}
    478 		if not "queue" in hostscache[host]:
    479 			hostscache[host]["queue"] = {}
    480 
    481 		filterrules = cacherobots(cachedir, uri, \
    482 				host=host, \
    483 				port=port, \
    484 				filtercache=robotscache)
    485 		if selectorisallowed(filterrules, selector) == True:
    486 			hostscache[host]["queue"][uri] = None
    487 			print("pushed to queue: %s" % (uri))
    488 		else:
    489 			pass
    490 			print("blocked by robots: %s" % (uri))
    491 
    492 	def getqueuelen():
    493 		queuelen = 0
    494 		for host in hostscache:
    495 			queuelen += len(hostscache[host]["queue"])
    496 		return queuelen
    497 
    498 	hostscache = loadlistdb("hostscache.pickle")
    499 	if hostscache == []:
    500 		hostscache = {}
    501 	hostscount = loadlistdb("hostscount.pickle")
    502 	if hostscount == [] or forcehostscount == True:
    503 		hostscount = {}
    504 		for host in list(hostscache.keys()):
    505 			print("host = %s, queuelen = %d" \
    506 					% (host, \
    507 					   len(hostscache[host]["queue"])))
    508 			if len(hostscache[host]["queue"]) == 0:
    509 				del hostscache[host]
    510 				continue
    511 			for uri in hostscache[host]["queue"]:
    512 				(host, port, mtype, selector) = parseuri(uri)
    513 				addhostscount(host)
    514 
    515 	def storestate():
    516 		if blocklistfile != None:
    517 			blocklist = loadlist(blocklistfile)
    518 			if len(blocklist) > 0:
    519 				print("blocklist: %s" % (blocklist))
    520 		print("################## Storing state to disc.")
    521 		storelistdb("knownuris.pickle", knownuris)
    522 		storelistdb("hostscache.pickle", hostscache)
    523 		storelistdb("hostscount.pickle", hostscount)
    524 		print("################## Storing state to disc done.")
    525 
    526 	jobs = []
    527 	if starturi != None:
    528 		#print("starturi = %s" % (starturi))
    529 		if not isblocked(starturi):
    530 			(starthost, startport, startmtype, startselector) = parseuri(starturi)
    531 			addhostscache(starturi, \
    532 					selector=startselector, \
    533 					host=starthost, \
    534 					port=startport)
    535 			try:
    536 				jobs.append([starturi, starthost, int(startport), startselector])
    537 			except ValueError:
    538 				# Please fix your URI.
    539 				pass
    540 
    541 	# Store state keeper.
    542 	startnow = datetime.now()
    543 	storedelta = timedelta(seconds=10) # 30 seconds
    544 
    545 	lastlenknownhosts = len(hostscache)
    546 	lastlenuriqueue = getqueuelen()
    547 	while lastlenuriqueue > 0:
    548 		if len(jobs) < workernum:
    549 			for host in list(hostscache.keys()):
    550 				if len(hostscache[host]["queue"]) == 0:
    551 					del hostscache[host]
    552 					if host in hostscount:
    553 						del hostscount[host]
    554 
    555 			selhosts = sorted(hostscount.items(), \
    556 					key=operator.itemgetter(1))[:workernum*2]
    557 
    558 			# Give hosts with many selectors more jobs.
    559 			hostjobs = {}
    560 			for selhost in selhosts:
    561 				# 10 ** x
    562 				hostjobs[selhost[0]] = \
    563 					math.floor(math.log10(selhost[1]))
    564 				if hostjobs[selhost[0]] == 0:
    565 					hostjobs[selhost[0]] = 1
    566 			print("Queue Status: %s" % (hostjobs))
    567 
    568 			for selhost in selhosts:
    569 				selhost = selhost[0]
    570 				seluris = hostscache[selhost]["queue"]
    571 				while hostjobs[selhost] > 0:
    572 					if len(seluris) == 0:
    573 						break
    574 					jobitem = seluris.popitem()
    575 					if isblocked(jobitem[0]):
    576 						continue
    577 					(host, port, mtype, selector) = parseuri(jobitem[0])
    578 					job = [jobitem[0], host, port, selector]
    579 					if job not in jobs:
    580 						jobs.append([jobitem[0], host, port, selector])
    581 					hostjobs[selhost] -= 1
    582 
    583 		print("Getting %d jobs." % (len(jobs)))
    584 
    585 		dataresults = []
    586 		with Pool(processes=workernum) as pool:
    587 			dataresults = pool.map(poolgopher, jobs)
    588 			#data = gopher(host=host, port=port, selector=selector)
    589 		jobs = []
    590 
    591 		for dataresult in dataresults:
    592 			(cururi, host, port, selector, data) = dataresult
    593 			subhostscount(host)
    594 			storerawdata(cachedir, cururi, data, host=host, port=port)
    595 			menudata = parsemenu(data)
    596 			#print(menudata)
    597 			for mi in menudata:
    598 				# Only menus so far.
    599 				if mi[0] == "1":
    600 					# Fix menu items with ports in hosts. 
    601 					if ":" in mi[3]:
    602 						mi[3] = mi[3].split(":")[0]
    603 
    604 					guri =  "gopher://%s:%s/%s%s" % \
    605 							(mi[3], mi[4], mi[0], mi[2])
    606 
    607 					addhostscache(guri, host=mi[3], \
    608 							port=mi[4], \
    609 							selector=mi[2])
    610 
    611 			print("Uri %s done." % (cururi))
    612 			knownuris[cururi] = None
    613 
    614 		lenuriqueue = getqueuelen()
    615 		lenknownuris = len(knownuris)
    616 		lenknownhosts = len(hostscache)
    617 		print("> queue hosts = %d (%d) %s" % \
    618 				(lenknownhosts, lenknownhosts -
    619 					lastlenknownhosts, hostscache.keys()))
    620 		print("> uri queue len = %d (%d)" % \
    621 				(lenuriqueue, lenuriqueue - lastlenuriqueue))
    622 		print("> visited uris = %d (%d)" % \
    623 				(lenknownuris, lenknownuris - lastlenknownuris))
    624 		lastlenknownuris = lenknownuris
    625 		lastlenuriqueue = lenuriqueue
    626 		lastlenknownhosts = lenknownhosts
    627 
    628 		# TODO: Remove after debugging
    629 		nowdelta = datetime.now() - startnow
    630 		if nowdelta >= storedelta:
    631 			storestate()
    632 			startnow = datetime.now()
    633 
    634 		time.sleep(0.2) # don't be too harsh on servers
    635 
    636 		#break #oneshot
    637 
    638 	# Save at end of even single shot.
    639 	storestate()
    640 
    641 	return 0
    642 
    643 if __name__ == "__main__":
    644 	sys.exit(main(sys.argv))
    645