Last active
October 23, 2018 03:54
-
-
Save plutocrat/ecbefa95926e6de0edad6bcf3ea0bce4 to your computer and use it in GitHub Desktop.
Quick script to remove web crawlers, bots, and other non-visitor traffic from web logs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Strip web crawler / bot requests and local housekeeping traffic from a web
# server access log. The input file is never modified: it is copied to
# OUTFILE and every deletion is performed on that copy.
#
# Usage: <script> LOGFILE
#
# Strict `set -e` is deliberately not used: every function reports failure
# through its return status, and the script's exit status is that of main.

## Set Variables
# Pass input file as a commandline argument, or set it here, and your output file
INFILE=${1:-}
# The output is written next to the input. Relative paths keep the historic
# "./" prefix; absolute paths must not get it, or the result would point to a
# non-existent directory tree below the current directory.
case $INFILE in
  /*) OUTFILE=$INFILE.squish ;;
  *) OUTFILE=./$INFILE.squish ;;
esac

# Lists of patterns to delete from logfiles, separated by whitespace. Each
# pattern is a basic regular expression as understood by grep. A slash does
# not need escaping; the older "\/" spelling is still accepted.
# LOCALTRAFFIC will be unique to your setup.
LOCALTRAFFIC=" wp-cron.php 10.10.0.2 wp-login.php \/wp-admin\/ "
# List of known bots. Add more if you see them after looking at the OUTFILE with
# awk -F "\"" '{print $6}' xx.log.squish | sort | uniq -c | sort -nr | head -n 20
BOTLIST="ahrefs Baiduspider bingbot Cliqzbot cs.daum.net DomainCrawler DuckDuckGo Exabot Googlebot linkdexbot magpie-crawler MJ12bot msnbot OpenLinkProfiler.org MegaIndex.ru opensiteexplorer pingdom rogerbot SemrushBot SeznamBot sogou.com\/docs tt-rss Wotbox YandexBot YandexImages ysearch\/slurp BLEXBot Flamingo_SearchEngine okhttp scalaj-http UptimeRobot YisouSpider proximic.com\/info\/spider "
## End of Variables

# Scratch file currently in use (empty when there is none); removed on exit so
# an interrupted run does not leave temporary files behind.
SCRATCH=
cleanup() {
  if [[ -n $SCRATCH ]]; then
    rm -f -- "$SCRATCH"
  fi
}
trap cleanup EXIT

#######################################
# Delete every line matching any pattern of a list from a file, in place, and
# print one "<count> instances of <pattern> removed" line per pattern, sorted
# by count (highest first). Patterns are applied in list order, so a line is
# attributed to the first pattern that matches it.
# Globals:   SCRATCH (written)
# Arguments: $1 - whitespace-separated list of basic regular expressions
#            $2 - file to filter in place
# Outputs:   the per-pattern report on stdout, diagnostics on stderr
# Returns:   0 on success, non-zero if the file cannot be read or rewritten
#######################################
remove_patterns() {
  local list=$1 file=$2
  local -a terms=()
  local term pattern count rc report=''
  local escaped_slash='\/' slash='/'

  if [[ ! -f $file || ! -r $file || ! -w $file ]]; then
    printf 'Cannot read and write %s\n' "$file" >&2
    return 1
  fi

  # Split with read instead of an unquoted expansion so that patterns
  # containing *, ? or [ are never glob-expanded against the current
  # directory. read returns non-zero at end of input; that is expected.
  IFS=$' \t\n' read -r -d '' -a terms <<<"$list"

  SCRATCH=$(mktemp) || return

  for term in "${terms[@]}"; do
    # "\/" was only needed as a delimiter escape for sed; for grep it would be
    # a stray backslash, so normalise it to a plain slash.
    pattern=${term//"$escaped_slash"/$slash}

    # -a: treat the log as text even if it contains odd bytes, otherwise grep
    # could emit "Binary file matches" instead of the surviving lines.
    # grep exits 1 when nothing matches, which is not an error here.
    count=$(grep -a -c -- "$pattern" <"$file")
    rc=$?
    if ((rc > 1)); then
      rm -f -- "$SCRATCH"
      SCRATCH=
      return "$rc"
    fi

    grep -a -v -- "$pattern" <"$file" >"$SCRATCH"
    rc=$?
    if ((rc > 1)); then
      rm -f -- "$SCRATCH"
      SCRATCH=
      return "$rc"
    fi

    # Overwrite rather than rename so the copy keeps its permissions.
    if ! cat -- "$SCRATCH" >"$file"; then
      rm -f -- "$SCRATCH"
      SCRATCH=
      return 1
    fi

    report+="$count instances of $term removed"$'\n'
  done

  rm -f -- "$SCRATCH"
  SCRATCH=

  printf '%s' "$report" | sort -nr
}

#######################################
# Print size and line statistics for the original and the processed log.
# Arguments: $1 - original log file
#            $2 - processed log file
# Outputs:   the summary on stdout
# Returns:   0 on success, non-zero if either file cannot be read
#######################################
print_summary() {
  local infile=$1 outfile=$2
  local prelines presize postlines postsize percent

  prelines=$(wc -l <"$infile") || return
  presize=$(wc -c <"$infile") || return
  postlines=$(wc -l <"$outfile") || return
  postsize=$(wc -c <"$outfile") || return

  # Some wc implementations pad their output with blanks.
  prelines=${prelines//[[:space:]]/}
  presize=${presize//[[:space:]]/}
  postlines=${postlines//[[:space:]]/}
  postsize=${postsize//[[:space:]]/}

  # Percentage of lines kept, rounded half up. An empty input has nothing to
  # reduce, so report 100 instead of dividing by zero.
  if ((prelines > 0)); then
    percent=$(((200 * postlines + prelines) / (2 * prelines)))
  else
    percent=100
  fi

  printf '%s\n' "Original file $infile is $presize bytes and contains $prelines lines"
  printf '%s\n' "Processed file $outfile is $postsize bytes and contains $postlines lines"
  printf '%s\n' "Log reduced to $percent percent of its original size."
  printf '%s\n' "Original file was untouched."
}

#######################################
# Copy INFILE to OUTFILE, filter the copy and print a summary.
# Globals:   INFILE, OUTFILE, LOCALTRAFFIC, BOTLIST (read)
# Returns:   0 on success, 2 on missing argument, non-zero on any failure
#######################################
main() {
  if [[ -z $INFILE ]]; then
    printf 'Usage: %s LOGFILE\n' "${0##*/}" >&2
    return 2
  fi
  if [[ ! -f $INFILE || ! -r $INFILE ]]; then
    printf 'Cannot read input file: %s\n' "$INFILE" >&2
    return 1
  fi

  # Start, but give people a chance to opt out (Ctrl-C) ...
  read -r -p "Will copy $INFILE to $OUTFILE and perform all operations on the file copy. Press ENTER to proceed ..." _

  cp -- "$INFILE" "$OUTFILE" || return

  echo
  echo "-------- Removing local traffic ---------"
  remove_patterns "$LOCALTRAFFIC" "$OUTFILE" || return

  echo
  echo "------- Removing Bots ---------"
  remove_patterns "$BOTLIST" "$OUTFILE" || return

  echo
  echo "======Summary======="
  print_summary "$INFILE" "$OUTFILE"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment