Last active
January 15, 2018 02:04
-
-
Save boazsender/6628205d677685078b7ca8fdfe6e3040 to your computer and use it in GitHub Desktop.
The first of each identity in the VGG face dataset.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Author: Boaz Sender | |
# | |
# Script for downloading all the good images in the VGG data set. | |
# | |
# VGG is available for download from www.robots.ox.ac.uk/~vgg/data/vgg_face/ | |
# | |
# usage: from the root of a vgg_face_dataset download, run: | |
# mkdir scrape_full && ./vgg_full_scrape.sh
# Loop over all the files in the vgg data set that we downloaded | |
for file in `ls -1 files` | |
do | |
# Figure out the name of the person who we are downloading an image for | |
# from the file name | |
identity=$(echo $file | rev | cut -c5- | rev) | |
declare -i count=1 | |
# make a file to log all the dead images in | |
touch dead_images.txt | |
function scrape_image() | |
{ | |
# download the image | |
wget $1 --output-document=scrape_full/$2 --timeout=6 --tries 2; | |
# If the download creates a recognizeable image, then figure out the format, | |
# give it the correct extension, and move on | |
if file "scrape_full/$2" | grep "image"; then | |
format=$(file -0 -F" " "scrape_full/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]') | |
mv "scrape_full/$2" "scrape_full/$2.$format" | |
echo "******************************************************************" | |
echo "Images successful! for ${identity} at count number ${count}!" | |
echo "******************************************************************" | |
# Otherwise, if we got an unrecognizeable image delete that garbo | |
# and log that is was a bad image | |
else | |
echo "******************************************************************" | |
echo "Images download failed for ${identity} at count number ${count}! Recording and moving on" | |
echo "******************************************************************" | |
echo $identity $2 >> dead_images.txt | |
rm scrape_full/$2 | |
fi | |
# Increment the counter, and call this function again with the next line | |
((count++)) | |
arg=$count | |
arg+="p" | |
nextarr=(`sed -n $arg 'files/'$file`) | |
scrape_image ${nextarr[1]} ${identity}_${nextarr[0]} | |
} | |
# Kickoff this shindig with the first line of the current file | |
arr=(`sed -n 1p 'files/'$file`) | |
scrape_image ${arr[1]} ${identity}_${arr[0]} | |
done; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# usage: from the root of a vgg_face_dataset download run | |
# ./vgg_process_files.sh > vgg_face_dataset_first_image_per_identity.txt | |
for i in `ls -1 files` | |
do | |
head 'files/'$i -n1 -v | |
done; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Author: Boaz Sender | |
# | |
# Script for processing the data in vgg_face_dataset and downloading a single | |
# image for each identity (person) in the data. | |
# | |
# The VGG data is available for download from | |
# www.robots.ox.ac.uk/~vgg/data/vgg_face/ | |
# | |
# At the time of this writing, about 12% of the first images of each identity | |
# are no longer available, or broken in some way. This script tries
# downloading until it finds a good image. | |
# | |
# usage: from the root of a vgg_face_dataset download, run: | |
# mkdir scrape && ./vgg_process_files_and_scrape.sh | |
# Define our scraper function | |
function scrape_image() | |
{ | |
# download the image, and if it downloads and creates a recognizeable image, | |
# then figure out the format, give it the correct extension, and move on | |
if wget $1 --output-document=scrape/$2 -q --timeout=6 --tries 2 && file "scrape/$2" | grep "image" && (( $(identify -format %n scrape/$2) < 2)); then | |
format=$(file -0 -F" " "scrape/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]') | |
mv "scrape/$2" "scrape/$2.$format" | |
echo "******************************************************************" | |
echo "Images successful! for ${identity} at count number ${count}!" | |
echo "******************************************************************" | |
# otherwise, if the wget fails, or downloads and empty file, or something | |
# that is not an image delete that garbo, and try again with the next line | |
# of the identity's file | |
else | |
((count++)) | |
echo "******************************************************************" | |
echo "Image failed, moving on to image number ${count} for ${identity}" | |
echo "******************************************************************" | |
rm scrape/$2 | |
arg=$count | |
arg+="p" | |
nextarr=(`sed -n $arg 'files/'$file`) | |
scrape_image ${nextarr[1]} ${identity}_${nextarr[0]} | |
fi | |
} | |
# Loop over all the files in the vgg data set that we downloaded | |
for file in `ls -1 files` | |
do | |
# Figure out the name of the person who we are downloading an image for | |
# from the file name | |
identity=$(echo $file | rev | cut -c5- | rev) | |
declare -i count=1 | |
# Kickoff this shindig with the first line of the current file | |
arr=(`sed -n 1p 'files/'$file`) | |
scrape_image ${arr[1]} ${identity}_${arr[0]} | |
done; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# usage: from the root of a vgg_face_dataset download run | |
# mkdir scrape && ./vgg_scrape.sh | |
while read p; do | |
arr=(`echo ${p}`) | |
wget ${arr[2]} --output-document=scrape/${arr[0]}.jpg --timeout=6 --tries 2 | |
done < vgg_face_dataset_first_image_per_identity.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great point, updated!