bajanReece · August 29, 2015 14:04
diff --git a/Site downloader b/Site downloader
 #!/bin/bash
 # Site downloader: mirrors a site, or the part of a site beneath a given
 # directory, into ~/downloaded_sites using wget.
 #
 # Usage: <this script> URL
 #
 # URL may be written the way people naturally type addresses:
 #   http://www.somedomain.com
 #   www.somedomain.com
 #   somedomain.com
 # The host is used exactly as supplied; "www." is never assumed, so
 #   acme.somedomain.com
 #   t.somedomain.com
 #   ww2.somedomain.com
 # are all accepted as they are.
 # URL may also point at a resource beneath the host:
 #   http://www.somedomain.com/a/deep/resource/
 #   http://c.somedomain.com/another/resource/thats/deep/
 # A URL that does not end in "/" is assumed to end in a file name, which is
 # stripped so that the download starts at the containing directory:
 #   http://www.somedomain.com/index.html   -> www.somedomain.com/
 #   somedomain.com/my/new/things/page.php  -> somedomain.com/my/new/things/
 #
 # Exit status: 2 when URL is missing or contains no host name; otherwise the
 # status of mkdir (if it fails) or of wget.

 # Downloads are collected beneath the invoking user's home directory.
 LOCAL_PATH=~
 SAVE_PATH="$LOCAL_PATH/downloaded_sites"

 # Print $1 with a leading http:// or https:// removed.
 # The scheme is matched with shell patterns instead of sed: the original
 # author found that sed on Mac OS Mavericks rejects the case-insensitive I
 # flag, whereas bracket patterns match any letter case everywhere. Only a
 # scheme at the very start is removed, so an address embedded later in the
 # string (for example inside a query string) is left untouched.
 strip_scheme() {
   local url=$1
   case "$url" in
     [Hh][Tt][Tt][Pp]://* | [Hh][Tt][Tt][Pp][Ss]://*)
       url=${url#*://}
       ;;
   esac
   printf '%s\n' "$url"
 }

 # Print the address wget should start from, given a scheme-less URL in $1.
 #   ends in "/"     -> used unchanged, it already names a directory
 #   contains a "/"  -> the last component (assumed to be a file) is removed
 #   bare host name  -> "/" is appended so the whole site is fetched
 build_download_path() {
   local supplied=$1
   case "$supplied" in
     */) printf '%s\n' "$supplied" ;;
     */*) printf '%s/\n' "${supplied%/*}" ;;
     *) printf '%s/\n' "$supplied" ;;
   esac
 }

 # Validate the URL in $1, create the save directory and run wget.
 # Returns 2 on a missing/unusable URL, otherwise mkdir's or wget's status.
 main() {
   if [ "$#" -lt 1 ] || [ -z "$1" ]; then
     printf 'Usage: %s URL\n' "${0##*/}" >&2
     return 2
   fi

   local supplied_path allowed_domain download_path
   supplied_path=$(strip_scheme "$1")
   # Everything before the first "/" is the host name; the recursive download
   # is restricted to it through --domains.
   allowed_domain=${supplied_path%%/*}
   if [ -z "$allowed_domain" ]; then
     printf 'Error: no host name found in "%s"\n' "$1" >&2
     return 2
   fi
   download_path=$(build_download_path "$supplied_path")

   # Create the save path if it doesn't exist yet.
   mkdir -p "$SAVE_PATH" || return
   # Every expansion is quoted so paths containing spaces or glob characters
   # reach wget as single arguments.
   wget --directory-prefix="$SAVE_PATH" --recursive --no-clobber \
     --page-requisites --html-extension --convert-links \
     --restrict-file-names=windows --domains "$allowed_domain" \
     --no-parent "$download_path"
 }

 # main is deliberately the last command: its status becomes the script's exit
 # status, so a successful download is no longer reported as a failure.
 main "$@"
	#!/bin/bash
	# Site downloader: mirrors a site, or the part of a site beneath a given
	# directory, into ~/downloaded_sites using wget.
	#
	# Usage: <this script> URL
	#
	# URL may be written the way people naturally type addresses:
	#   http://www.somedomain.com
	#   www.somedomain.com
	#   somedomain.com
	# The host is used exactly as supplied; "www." is never assumed, so
	#   acme.somedomain.com
	#   t.somedomain.com
	#   ww2.somedomain.com
	# are all accepted as they are.
	# URL may also point at a resource beneath the host:
	#   http://www.somedomain.com/a/deep/resource/
	#   http://c.somedomain.com/another/resource/thats/deep/
	# A URL that does not end in "/" is assumed to end in a file name, which is
	# stripped so that the download starts at the containing directory:
	#   http://www.somedomain.com/index.html   -> www.somedomain.com/
	#   somedomain.com/my/new/things/page.php  -> somedomain.com/my/new/things/
	#
	# Exit status: 2 when URL is missing or contains no host name; otherwise the
	# status of mkdir (if it fails) or of wget.

	# Downloads are collected beneath the invoking user's home directory.
	LOCAL_PATH=~
	SAVE_PATH="$LOCAL_PATH/downloaded_sites"

	# Print $1 with a leading http:// or https:// removed.
	# The scheme is matched with shell patterns instead of sed: the original
	# author found that sed on Mac OS Mavericks rejects the case-insensitive I
	# flag, whereas bracket patterns match any letter case everywhere. Only a
	# scheme at the very start is removed, so an address embedded later in the
	# string (for example inside a query string) is left untouched.
	strip_scheme() {
	  local url=$1
	  case "$url" in
	    [Hh][Tt][Tt][Pp]://* | [Hh][Tt][Tt][Pp][Ss]://*)
	      url=${url#*://}
	      ;;
	  esac
	  printf '%s\n' "$url"
	}

	# Print the address wget should start from, given a scheme-less URL in $1.
	#   ends in "/"     -> used unchanged, it already names a directory
	#   contains a "/"  -> the last component (assumed to be a file) is removed
	#   bare host name  -> "/" is appended so the whole site is fetched
	build_download_path() {
	  local supplied=$1
	  case "$supplied" in
	    */) printf '%s\n' "$supplied" ;;
	    */*) printf '%s/\n' "${supplied%/*}" ;;
	    *) printf '%s/\n' "$supplied" ;;
	  esac
	}

	# Validate the URL in $1, create the save directory and run wget.
	# Returns 2 on a missing/unusable URL, otherwise mkdir's or wget's status.
	main() {
	  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
	    printf 'Usage: %s URL\n' "${0##*/}" >&2
	    return 2
	  fi

	  local supplied_path allowed_domain download_path
	  supplied_path=$(strip_scheme "$1")
	  # Everything before the first "/" is the host name; the recursive download
	  # is restricted to it through --domains.
	  allowed_domain=${supplied_path%%/*}
	  if [ -z "$allowed_domain" ]; then
	    printf 'Error: no host name found in "%s"\n' "$1" >&2
	    return 2
	  fi
	  download_path=$(build_download_path "$supplied_path")

	  # Create the save path if it doesn't exist yet.
	  mkdir -p "$SAVE_PATH" || return
	  # Every expansion is quoted so paths containing spaces or glob characters
	  # reach wget as single arguments.
	  wget --directory-prefix="$SAVE_PATH" --recursive --no-clobber \
	    --page-requisites --html-extension --convert-links \
	    --restrict-file-names=windows --domains "$allowed_domain" \
	    --no-parent "$download_path"
	}

	# main is deliberately the last command: its status becomes the script's exit
	# status, so a successful download is no longer reported as a failure.
	main "$@"