Last active
June 11, 2018 08:21
-
-
Save Skarlett/072c096f429e39137106b248ad1442b0 to your computer and use it in GitHub Desktop.
Extensive - configurable - Tumblr Scraper. (Python)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import time | |
def range_package(*args, **kwargs): | |
last_var = None | |
for i in range(*args, **kwargs): | |
if last_var and i: | |
yield last_var, i | |
last_var = i | |
class TumblrScraperDummy:
    """Minimal scraper for the legacy Tumblr v1 JSON read API.

    Subclass and override gather() to choose what is collected per post.
    """

    MAX_CHUNK_SIZE = 50        # the v1 API caps `num` at 50 posts per request
    HTTP = requests.Session    # session factory; override to customise transport
    SLEEP_TIME = 0.2           # polite delay between paginated requests

    def __init__(self, name):
        # `name` is the blog's subdomain, e.g. "staff" for staff.tumblr.com.
        self.name = name
        self.session = self.HTTP()

    @property
    def url(self):
        """JSON read endpoint for this blog."""
        return "http://{}.tumblr.com/api/read/json".format(self.name)

    def best_size_media(self, jpkg, lookfor='photo-url-'):
        """Return the media URL with the largest size suffix from a post dict.

        Keys look like 'photo-url-1280'; the trailing integer is the width.
        Raises KeyError if no key starts with `lookfor`.
        """
        last = 0
        for k in jpkg:
            if k.startswith(lookfor):
                size = int(k.split('-')[-1])
                if size > last:
                    last = size
        return jpkg[lookfor + str(last)]

    def get_json(self, start=0, num=50):
        """Fetch one page of posts and return the decoded JSON payload."""
        args = {
            'start': start,
            'num': num
        }
        r = self.session.get(self.url, params=args)
        # The endpoint returns JSONP: 'var tumblr_api_read = {...};'.
        # Strip the 22-character prefix and the trailing ';' to get JSON.
        text = r.text.strip()[22:-1]
        return json.loads(text)

    def scrape(self):
        """Walk every page of the blog and return the set of gathered items."""
        results = set()
        jpkg = self.get_json()
        total_posts = jpkg['posts-total']
        for post in jpkg['posts']:
            item = self.gather(post)
            # Skip posts the subclass's gather() filtered out, instead of
            # polluting the result set with None.
            if item is not None:
                results.add(item)
        for start, stop in range_package(0, total_posts, self.MAX_CHUNK_SIZE):
            time.sleep(self.SLEEP_TIME)
            # BUG FIX: pass the chunk *length* (stop - start) as `num`,
            # not the absolute stop index, which over-requested every page.
            for post in self.get_json(start, stop - start)['posts']:
                item = self.gather(post)
                if item is not None:
                    results.add(item)
        return results

    def gather(self, post):
        """Override point: map a raw post dict to a hashable result.

        NOTE(review): this default returns the post dict itself, which is
        unhashable and would make scrape()'s set.add raise; subclasses are
        expected to return a string or None.
        """
        return post
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import tumblr_scraper_lib | |
class ProxyPortal(requests.Session):
    """requests.Session that forces every GET through a local Tor SOCKS proxy."""

    # Tor's default SOCKS5 listener on localhost.
    PROXY = {
        'http': 'socks5://127.0.0.1:9050',
        'https': 'socks5://127.0.0.1:9050'
    }

    def get(self, url, **kwargs):
        """GET `url`, always substituting the Tor proxy for any caller-supplied one."""
        # pop() with a default replaces the membership test + pop pair;
        # discarding caller proxies guarantees ours cannot be overridden.
        kwargs.pop('proxies', None)
        return super().get(url, proxies=self.PROXY, **kwargs)
class Tumblr(tumblr_scraper_lib.TumblrScraperDummy):
    """Scraper variant whose HTTP traffic is routed over the Tor network."""

    # All GET requests now travel through Tor (I'd hate to get IP banned).
    HTTP = ProxyPortal

    def gather(self, post):
        """Collect the best-resolution photo URL; non-photo posts yield None."""
        if post['type'].lower() != 'photo':
            return None
        return self.best_size_media(post)
if __name__ == '__main__':
    # Scrape each blog in turn and collect the per-blog result sets.
    results = []
    # BUG FIX: the original literal had a '.' instead of ',' after 'list'
    # ('list'. 'of'), which is a SyntaxError.
    for name in ['my', 'giant', 'list', 'of', 'user', 'names']:
        user = Tumblr(name)
        results.append(user.scrape())
    print(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment