Last active
September 14, 2019 09:56
-
-
Save DollarAkshay/8c780736d826d59fc7e29861177fdd7c to your computer and use it in GitHub Desktop.
Multithreaded Image Scraper in Python (Insanely fast on Ryzen 7)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math
import os
import threading
import time
import urllib.request

import cv2
import numpy as np
def getPokemon(start, end):
    """Download Pokedex artwork for every id in ``range(start, end)``.

    Each image is fetched from assets.pokemon.com (the URL uses a 3-digit,
    zero-padded id), decoded with OpenCV and written to
    ``images/<4-digit id>.png`` relative to the current working directory.
    The function is used as a thread target, so it reports progress and
    failures on stdout instead of raising: one bad id never aborts the rest
    of the range.

    Args:
        start: First id to download (inclusive).
        end: Id at which to stop (exclusive).

    Returns:
        None.
    """
    print("Started worker for range :", start, "to", end)
    # cv2.imwrite reports failure through its return value rather than an
    # exception, so make sure the output directory exists before writing.
    # exist_ok=True keeps this safe when several workers start together.
    os.makedirs("images", exist_ok=True)
    for i in range(start, end):
        file_name = '{:04d}'.format(i) + '.png'
        try:
            url = 'https://assets.pokemon.com/assets/cms2/img/pokedex/detail/' + \
                '{:03d}'.format(i) + '.png'
            request = urllib.request.Request(url)
            # Close the HTTP response as soon as the payload has been read.
            with urllib.request.urlopen(request) as response:
                binary_str = response.read()
            byte_array = bytearray(binary_str)
            numpy_array = np.asarray(byte_array, dtype="uint8")
            image = cv2.imdecode(numpy_array, cv2.IMREAD_UNCHANGED)
            if image is None:
                # imdecode returns None when the bytes are not a valid image.
                print("Could not decode " + file_name)
                continue
            # Only claim success when OpenCV confirms the file was written.
            if cv2.imwrite("images/" + file_name, image):
                print("Saved " + file_name)
            else:
                print("Could not write " + file_name)
        except Exception as e:
            # Best-effort worker: log the failure and move on to the next id.
            print(str(e))
# Driver: split the 1-based id range 1..image_count into thread_count
# contiguous slices, download each slice on its own thread, and report the
# total wall-clock time.
start_time = time.time()
thread_count = 16
image_count = 801

# boundaries[k] is the first id of slice k; slice k covers
# [boundaries[k], boundaries[k + 1]). The +1 shifts from 0-based to 1-based ids.
boundaries = [
    math.floor(k * image_count / thread_count) + 1
    for k in range(thread_count + 1)
]
thread_list = [
    threading.Thread(target=getPokemon, args=(first_id, stop_id))
    for first_id, stop_id in zip(boundaries, boundaries[1:])
]

# Launch every worker before waiting on any of them so the downloads overlap.
for worker in thread_list:
    worker.start()
for worker in thread_list:
    worker.join()

end_time = time.time()
print("Done")
print(f"Time taken : {end_time - start_time}sec")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ateeqmughal266 I am saving them in a folder called images as you can see on line 21. You will have to create a folder called images before running this script. The folder should be in the same directory as the running script.