Last active
August 29, 2015 14:04
-
-
Save bajanReece/f004870e3dfdb94f4e8c to your computer and use it in GitHub Desktop.
A bash script to download entire web paths using wget and sed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Mirror an entire web path with wget.
#
# The supplied URL is sanitized so the user can pass it naturally:
#   http://www.somedomain.com
#   www.somedomain.com
#   somedomain.com
#   acme.somedomain.com            (no www. is ever assumed)
#   somedomain.com/a/deep/resource/
# A trailing file pointer is stripped, so these mirror their parent path:
#   http://www.somedomain.com/index.html
#   somedomain.com/my/new/things/page.php
#
# Usage: ./script.sh <url>

set -u

# Create the save path under $HOME if it doesn't exist (done just before wget).
LOCAL_PATH=~
SAVE_PATH="$LOCAL_PATH/downloaded_sites"

# Require a URL argument; the original script silently ran wget with nothing.
if [ -z "${1:-}" ]; then
  echo "Usage: ${0##*/} <url>" >&2
  exit 2
fi

# 1. Strip http:// OR https:// from the supplied string.
#    '|' is used as the sed delimiter instead of '/' because the pattern
#    itself contains forward slashes and we don't want to escape them
#    (e.g. s/http:\/\///). The case-insensitive I flag would be nicer, but
#    macOS (BSD) sed doesn't support it -- s|http[s]*://||I throws an error.
DOWNLOAD_PATH=""
SUPPLIED_PATH="$(printf '%s' "$1" | sed 's|http[s]*://||')"

# Split the path into an array for two things:
#   1. detecting/stripping a trailing resource (file) component, and
#   2. extracting the domain for wget's --domains restriction.
# The internal field separator (IFS) set to '/' makes read split the string.
IFS='/' read -ra PATH_ARR <<< "$SUPPLIED_PATH"
PATH_ARR_LEN=${#PATH_ARR[@]}
ALLOWED_DOMAIN=${PATH_ARR[0]}

# 2. Check the last character; if it's not a forward slash the final
#    component is assumed to be a file pointer and is removed.
#    A bare domain ("somedomain.com") has nothing to strip, so keep it whole
#    -- the original loop produced an EMPTY download path in that case.
LAST_PATH_CHAR="${SUPPLIED_PATH: -1}"
if [ "$LAST_PATH_CHAR" != "/" ] && [ "$PATH_ARR_LEN" -gt 1 ]; then
  for ((i = 0; i < PATH_ARR_LEN - 1; i++)); do
    DOWNLOAD_PATH+="${PATH_ARR[$i]}/"
  done
else
  DOWNLOAD_PATH+=$SUPPLIED_PATH
fi

# Create the destination and mirror the site. All expansions are quoted so
# paths containing spaces or glob characters can't be word-split.
mkdir -p "$SAVE_PATH" || exit 1
wget --directory-prefix="$SAVE_PATH" \
  --recursive \
  --no-clobber \
  --page-requisites \
  --html-extension \
  --convert-links \
  --restrict-file-names=windows \
  --domains "$ALLOWED_DOMAIN" \
  --no-parent \
  "$DOWNLOAD_PATH"

# Propagate wget's exit status (the original always exited 1, signalling
# failure even on success).
exit $?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment