commit 29cd7839e600acdd21378256d73b4703f799f04a
parent 0dac4a637d7e25983b563286bb0539d53ddf8d3e
Author: Christoph Lohmann <20h@r-36.net>
Date: Mon, 12 Aug 2019 11:48:12 +0200
Optimize savehostscache.
Diffstat:
eomyidae | 49 ++++++++++++++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 17 deletions(-)
diff --git a/eomyidae b/eomyidae
@@ -429,6 +429,8 @@ def main(args):
starturi = largs[0]
knownuris = loadlistdb("knownuris.pickle")
+ if knownuris == []:
+ knownuris = {}
lastlenknownuris = len(knownuris)
def isblocked(uri):
@@ -449,38 +451,43 @@ def main(args):
if hostscount[host] <= 0:
del hostscount[host]
- def addhostscache(host, uri, port=70):
+ def addhostscache(uri, host=None, port=70, selector="/"):
+ if uri != None and host == None:
+ (host, port, mtype, selector) = parseuri(uri)
+ port = int(port)
+ else:
+ try:
+ port = int(port)
+ except ValueError:
+ return
+
if uri in knownuris:
- #print("ignored for queue: %s" % (uri))
+ print("ignored for queue: %s" % (uri))
return
if host == "":
- #print("ignored for queue: %s" % (uri))
+ print("ignored for queue: %s" % (uri))
return
if isblocked(uri):
print("blocked by filters: %s" % (uri))
return
- try:
- port = int(port)
- except ValueError:
- return
-
addhostscount(host)
+ if not host in hostscache:
+ hostscache[host] = {}
+ if not "queue" in hostscache[host]:
+ hostscache[host]["queue"] = {}
+
filterrules = cacherobots(cachedir, uri, \
host=host, \
port=port, \
filtercache=robotscache)
if selectorisallowed(filterrules, selector) == True:
- if not host in hostscache:
- hostscache[host] = {}
- if not "queue" in hostscache[host]:
- hostscache[host]["queue"] = {}
hostscache[host]["queue"][uri] = None
- #print("pushed to queue: %s" % (uri))
+ print("pushed to queue: %s" % (uri))
else:
pass
- #print("blocked by robots: %s" % (uri))
+ print("blocked by robots: %s" % (uri))
def getqueuelen():
queuelen = 0
@@ -518,9 +525,13 @@ def main(args):
jobs = []
if starturi != None:
+ #print("starturi = %s" % (starturi))
if not isblocked(starturi):
(starthost, startport, startmtype, startselector) = parseuri(starturi)
- addhostscache(hostscache, starthost, starturi)
+ addhostscache(starturi, \
+ selector=startselector, \
+ host=starthost, \
+ port=startport)
try:
jobs.append([starturi, starthost, int(startport), startselector])
except ValueError:
@@ -564,7 +575,9 @@ def main(args):
if isblocked(jobitem[0]):
continue
(host, port, mtype, selector) = parseuri(jobitem[0])
- jobs.append([jobitem[0], host, port, selector])
+ job = [jobitem[0], host, port, selector]
+ if job not in jobs:
+ jobs.append([jobitem[0], host, port, selector])
hostjobs[selhost] -= 1
print("Getting %d jobs." % (len(jobs)))
@@ -591,7 +604,9 @@ def main(args):
guri = "gopher://%s:%s/%s%s" % \
(mi[3], mi[4], mi[0], mi[2])
- addhostscache(mi[3], guri, port=mi[4])
+ addhostscache(guri, host=mi[3], \
+ port=mi[4], \
+ selector=mi[2])
print("Uri %s done." % (cururi))
knownuris[cururi] = None