Last active
March 26, 2016 04:41
-
-
Save tjefferson/111186e28c5d7a46a486 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
set -o nounset | |
set -o errexit | |
IFS=$'\n' | |
dt=$(date -d'yesterday' '+%Y%m%d') #每天读取前一天的日志 | |
logfile="/var/log/nginx/access.log-${dt}.gz" #nginx access.log日志 | |
#读取日志并将User Agent信息中包含爬虫标识的过滤出来进行解析操作 | |
zcat ${logfile} | awk -F'"' 'tolower($6)~/bot|spider|yahoo! slurp/ && $2~/GET/ {print $0}' | \ | |
while read line | |
do | |
user_agent=$(echo ${line} | awk -F'"' '{print tolower($6)}') | |
spider_name='unknown' | |
case ${user_agent} in | |
*google*mobile*|*mobile*google*) | |
spider_name='googlebot-mobile';; | |
*googlebot-image*) | |
spider_name='googlebot-image';; | |
*googlebot*) | |
spider_name='googlebot-pc';; | |
*mobile*baiduspider*) | |
spider_name='baiduspider-mobile';; | |
*baiduspider*) | |
spider_name='baiduspider-pc';; | |
*360spider*|*haosouspider*) | |
spider_name='360spider';; | |
*mobile*bingbot*) | |
spider_name='bingbot-mobile';; | |
*bingbot*) | |
spider_name='bingbot';; | |
*yahoo*) | |
spider_name='yahoobot';; | |
*yandexbot*) | |
spider_name='yandexbot';; | |
*sogou\ web\ spider*) | |
spider_name='sogoubot';; | |
esac | |
access_date=$(date -d ${dt} +'%Y-%m-%d') #爬取日期即为处理的access.log文件名中的日期 | |
origin_hour=$(echo ${line} | grep -oE '([01][1-9]|10|3[01])\/[a-zA-Z]{3}\/20[1-9]{2}:([01][0-9]|2[0-3])' | awk -F':' '{print $2}') | |
format_hour=$(echo ${origin_hour} | sed -r 's/^0([0-9])/\1/') | |
access_hour=$(((format_hour + 8) % 24)) #转换成北京时间的小时 | |
page_url=$(echo ${line} | awk -F'"' '{print $2}' | awk -F' ' '{print $2}') | |
#爬取的页面只保留前255个字符 | |
if [ ${#page_url} -gt 255 ] | |
then | |
page_url=$(echo ${page_url} | cut -c1-255) | |
fi | |
http_status=$(echo ${line} | awk -F'"' '{print $3}' | awk -F' ' '{print $1}') | |
#存入数据库 | |
echo "INSERT INTO blog_spider_test (spider_name,date,hour,page_url,http_status) VALUES('$spider_name','$access_date','$access_hour','$page_url','$http_status');" | mysql test | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
发现有client会伪装成googlebot,遂将进入爬虫信息统计的日志条件设置更为严格,增加:
&& $2~/GET/
仅筛选GET请求的条件