Patch against WikiTeam/wikiteam for grabbing https://wiki.dystify.com
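The core of the patch is in generateImageDump(): image URLs are forced to HTTPS, a HEAD request probes for a redirect before the real download, and a "Not Acceptable" response body aborts the run instead of being saved as an image. Below is a minimal standalone sketch of that flow, assuming a prepared requests.Session(); fetch_image and dest_path are illustrative names, and the sketch reads the redirect target from the Location header (the patch itself reuses r.url).

import re
import sys

import requests


def fetch_image(session, url, dest_path):
    # Sketch of the download flow the patch adds to generateImageDump().
    # fetch_image/dest_path are illustrative names, not part of the patch.

    # wiki.dystify.com only serves images over HTTPS, so rewrite the scheme.
    url = url.replace('http:', 'https:')

    # Probe with HEAD first; if the server redirects, take the target from
    # the Location header and log it.
    r = session.head(url=url, allow_redirects=False)
    if r.is_redirect:
        url = r.headers['Location']
        print('Site is redirecting us to: %s' % url)

    r = session.get(url=url, allow_redirects=False)
    if re.search(r'Not Acceptable', r.text):
        # The server answered with an error page instead of the image;
        # stop instead of writing HTML into the image dump.
        print('Server refused to send us content')
        sys.exit(1)

    with open(dest_path, 'wb') as imagefile:
        imagefile.write(r.content)

In the patch this logic sits inline inside the image loop of dumpgenerator.py, as the diff below shows.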
diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3193fe2..e09cd68 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -497,8 +497,10 @@ def getUserAgent():
     """ Return a cool user-agent to hide Python user-agent """
     useragents = [
         # firefox
-        'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
-        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0',
+        'Mozilla/5.0 (Windows NT 3.51) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
     ]
     return useragents[0]
@@ -574,6 +576,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         except requests.exceptions.ConnectionError as e:
             print ' Connection error: %s'%(str(e[0]))
             xml = ''
+        except requests.exceptions.ReadTimeout as e:
+            print ' Read timeout: %s'%(str(e[0]))
+            xml = ''
         c += 1
     return xml
@@ -1471,16 +1476,30 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             print 'Filename is too long, truncating. Now it is:', filename2
         filename3 = u'%s/%s' % (imagepath, filename2)
         imagefile = open(filename3, 'wb')
-        r = requests.get(url=url)
+
+        # HACK HACK for wiki.dystify.com
+        url = url.replace('http:','https:')
+
+        r = session.head(url=url, allow_redirects=False)
+        if r.is_redirect:
+            print 'Site is redirecting us to: ', r.url
+            url = r.url
+        print 'Final URL image', url
+        r = session.get(url=url, allow_redirects=False)
+        if re.search(r'Not Acceptable', r.content):
+            print 'Server refused to send us content'
+            sys.exit()
         imagefile.write(r.content)
         imagefile.close()
         # saving description if any
         try:
             title = u'Image:%s' % (filename)
             if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+                print 'Using action=query to export title: ', title
                 r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
                 xmlfiledesc = r.text
             else:
+                print 'Using getXMLFileDesc() to export title: ', title
                 xmlfiledesc = getXMLFileDesc(
                     config=config,
                     title=title,
@@ -1494,7 +1513,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             f = open('%s/%s.desc' % (imagepath, filename2), 'w')
             # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
-            if not re.search(r'</mediawiki>', xmlfiledesc):
+            if not re.search(r'</page>', xmlfiledesc):
                 # failure when retrieving desc? then save it as empty .desc
                 xmlfiledesc = ''
             f.write(xmlfiledesc.encode('utf-8'))
@@ -1976,7 +1995,9 @@ def checkAPI(api=None, session=None):
 def checkIndex(index=None, cookies=None, session=None):
     """ Checking index.php availability """
+    print 'Index URL: ', index
     r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
+    #print r.text
     if r.status_code >= 400:
         print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
         return False
diff --git a/launcher.py b/launcher.py
index 708635a..e7b4562 100644
--- a/launcher.py
+++ b/launcher.py
@@ -76,15 +76,15 @@ def main():
                 started = True
                 break #stop searching, dot not explore subdirectories
-        # time.sleep(60)
+        time.sleep(60)
         # Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
         # such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
         # typically they don't provide any crawl-delay value in their robots.txt).
         if started and wikidir: #then resume
             print 'Resuming download, using directory', wikidir
-            subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
+            subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--delay=10', '--retries=10', '--path={}'.format(wikidir)], shell=False)
         else: #download from scratch
-            subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
+            subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--delay=10', '--retries=10'], shell=False)
             started = True
         #save wikidir now
         for f in os.listdir('.'):
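The getXMLPageCore() hunk only adds an extra except clause, and the launcher.py hunk passes --delay=10 and --retries=10 (both existing dumpgenerator.py options) so that the retry loop gets more chances against a slow server. The sketch below shows roughly the retry pattern that the new ReadTimeout handler slots into; it is a simplification that ignores the incremental back-off and error logging the real function performs, and get_xml_with_retries is an illustrative name.

import requests


def get_xml_with_retries(session, url, params, maxretries=10):
    # Simplified stand-in for getXMLPageCore(): keep retrying until the
    # export request yields a body or the retry budget is exhausted.
    xml = ''
    c = 0
    while xml == '' and c < maxretries:
        try:
            r = session.post(url=url, data=params, timeout=10)
            xml = r.text
        except requests.exceptions.ConnectionError as e:
            print('Connection error: %s' % str(e))
            xml = ''
        except requests.exceptions.ReadTimeout as e:
            # This branch is what the patch adds: a wiki that accepts the
            # connection but never answers no longer kills the dump.
            print('Read timeout: %s' % str(e))
            xml = ''
        c += 1
    return xml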