Created
June 12, 2017 11:45
-
-
Save mrmartin/91fe4da82578c753a28a0ff533f2e9b9 to your computer and use it in GitHub Desktop.
Download Wikimedia Commons Google Art
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download Wikimedia Commons "Google Art Project" images - almost exclusively
# "flat" art: paintings, drawings, tapestries, ...
#
# Step 1: fetch the top-level category index and turn it into galleries.list,
# one absolute gallery (sub-category) URL per line.

# -p: do not fail when the script is re-run and the directory already exists.
mkdir -p artworks

# Two pages contain all top references. The URLs are single-quoted because
# they contain '?' and '&', which the shell would otherwise interpret.
wget 'https://commons.wikimedia.org/wiki/Category:Google_Art_Project_works_by_collection?uselang=en-gb' -O top_index.html
wget 'https://commons.wikimedia.org/w/index.php?title=Category:Google_Art_Project_works_by_collection&uselang=en-gb&subcatfrom=Norman+Rockwell+Museum%0AGoogle+Art+Project+works+in+Norman+Rockwell+Museum' -O top_index_2.html
cat top_index_2.html >> top_index.html
rm -f top_index_2.html

# Gallery page URLs: on every line mentioning "CategoryTreeSection", take the
# href value and prefix it with the site origin. -n together with the 'p' flag
# prints only lines where the substitution succeeded, so a line without a
# usable href never ends up in the list as a bogus URL.
sed -n '/CategoryTreeSection/ s|.*href="\([^"]*\)">.*|https://commons.wikimedia.org\1|p' \
  top_index.html > galleries.list
# Step 2: for each gallery listed in galleries.list, download the gallery page
# and append the absolute URL of every thumbnail link to artworks.list.

# Start from an empty list. ': >' truncates or creates the file, so this works
# on a first run (nothing to delete) and still leaves a readable, empty file
# when no gallery yields any link.
: > artworks.list

# read -r keeps backslashes in a URL literal instead of treating them as
# escape characters.
while read -r gallery; do
  wget "${gallery}" -O gallery.html
  # Only lines containing a thumb <div> are considered; -n plus the 'p' flag
  # emits a line only when an href was actually extracted from it.
  sed -n '/<div class="thumb"/ s|.*href="\([^"]*\)".*|https://commons.wikimedia.org\1|p' \
    gallery.html >> artworks.list
done < galleries.list
# Step 3: for each artwork page listed in artworks.list, download the page,
# extract the link that follows "Size of this preview:" and save that image
# into the artworks/ directory.
while read -r artwork; do
  wget "${artwork}" -O artwork.html

  # The extracted link(s) go to a scratch file named "artwork" (overwritten on
  # every iteration). -n plus the 'p' flag writes a line only when the href was
  # actually captured, so raw HTML is never handed to wget below.
  sed -n '/Size of this preview:/ s|.*preview: <a href="\([^"]*\)".*|\1|p' \
    artwork.html > artwork

  while read -r artwork_link; do
    wget "${artwork_link}" --directory-prefix=artworks
  done < artwork

  # Pause between artwork pages to avoid hammering the server.
  sleep 1
done < artworks.list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment