Created
February 1, 2017 17:57
-
-
Save doubleirish/b8af8e5fae83a281125b81f8c981801d to your computer and use it in GitHub Desktop.
Bash script that automatically logs in to a target page, extracts all relative URLs from that page, crawls each child page, scrapes an HTML table from it, and aggregates the results into a single summary file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Crawl a login-protected index page and aggregate one table from every
# linked child page into a single HTML summary.
#
# Steps:
#   1. POST the credentials to the index page and store every site-relative
#      link (href="/...") as an absolute URL in urls.txt.
#   2. Download each URL in urls.txt (re-sending the credentials), extract the
#      table content that follows the target <h2> heading with xmllint, and
#      append it to summary.html.
#   3. Strip unprintable characters from summary.html.
#
# Environment:
#   OS_USERNAME, OS_PASSWORD  login form values; the placeholders below are
#                             used only when the variable is unset or empty.
# Files written (current directory): urls.txt, page.html (scratch copy of the
#   most recently downloaded page), summary.html.
# Requires: wget, xmllint, grep, sed, awk, sort.

set -u

export OS_USERNAME="${OS_USERNAME:-yourUser}"
export OS_PASSWORD="${OS_PASSWORD:-yourPass}"

readonly BASE_URL='https://www.example.com'
readonly INDEX_PATH='/display/IT/Release+Management'
# Contents of the table(s) that follow the heading with exactly this text.
readonly TABLE_XPATH="//h2[text()='Production Issues Encountered']/following::table/node()"

#######################################
# Download one page, logging in via POSTed form fields (your form-data
# parameter names may be different).
# NOTE(review): the credentials are not URL-encoded and are passed on the
# wget command line, so characters such as '&' or '%' will corrupt the form
# data and the values are visible in the process list.
# Globals:   OS_USERNAME, OS_PASSWORD (read)
# Arguments: $1 - absolute URL to fetch
# Outputs:   page body on stdout; wget progress on stderr
# Returns:   wget's exit status
#######################################
fetch_page() {
  wget -O- \
    --post-data "os_username=${OS_USERNAME}&os_password=${OS_PASSWORD}" \
    "$1"
}

# Without the index page there is nothing to crawl, so stop with a clear
# message instead of silently producing an empty summary.
if ! index_html=$(fetch_page "${BASE_URL}${INDEX_PATH}"); then
  printf 'error: could not download index page %s\n' \
    "${BASE_URL}${INDEX_PATH}" >&2
  exit 1
fi

# Find the relative URLs, convert them to full URLs and store them in a local
# file. 'grep -o' emits every href on a line (not just the last one), and the
# pattern accepts only site-relative paths: it skips absolute URLs, '#'
# anchors, 'javascript:' links and protocol-relative '//host' links, which
# must not get the base URL prepended.
printf '%s\n' "$index_html" \
  | grep -oE 'href="/([^/"][^"]*)?"' \
  | sed -e 's/^href="//' -e 's/"$//' \
  | awk -v base="$BASE_URL" '{ print base $0 }' \
  | sort -u > urls.txt

# Iterate through the file of URLs and aggregate the selected XPath elements
# of each page into a single summary file.
printf '%s\n' "<html><head><title>summary</title></head><body>" > summary.html
while IFS= read -r url; do
  [[ -n "$url" ]] || continue
  printf 'parsing %s ...\n' "$url"
  printf '%s\n' "<h2> $url </h2> <table border='1'> " >> summary.html
  # Download the page (logging in again) and keep a scratch copy for xmllint.
  if fetch_page "$url" > page.html; then
    # stderr is discarded on purpose: xmllint reports HTML recovery problems
    # and empty XPath results there, and neither is an error for this crawl.
    xmllint --html --htmlout --recover --nowarning --format \
      --xpath "$TABLE_XPATH" page.html 2>/dev/null >> summary.html
  else
    printf 'warning: could not download %s; leaving its table empty\n' \
      "$url" >&2
  fi
  printf '%s\n' "</table> " >> summary.html
done < urls.txt
printf '%s\n' "</body></html>" >> summary.html

# Remove unprintable characters. A temporary file is used instead of
# 'sed -i' because the in-place flag takes different arguments on GNU and
# BSD sed.
if sed 's/[^[:print:]]//g' summary.html > summary.html.tmp; then
  mv -- summary.html.tmp summary.html
else
  rm -f -- summary.html.tmp
  printf 'warning: could not strip unprintable characters from summary.html\n' >&2
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment