一个简单的防爬虫脚本(转载欧彬)
因为公司有时候受爬虫的影响,有时候应用压力很高,所以决定封掉大部分爬虫。
1.检测user-agent
2.设定一个阀值,如果超过这个访问阀值,就进入灰名单,某个时间段连续两次进入灰名单,就干掉这个ip
3.检测开发提供的特殊链接,超过阀值并访问特殊链接,也限制它。
4.判断referer,如果为空的链接记录数大于整体访问的某个阀值,也限制该IP
#!/bin/bash
# Anti-crawler script by Sky Bin, 2009.12
# description: Disable Spider

# --- Environment -----------------------------------------------------------
export PATH="/var/PROGRAM/MANAGEMENT/modules/xbash:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:"
export LANG=zh_CN

# --- Tunables ---------------------------------------------------------------
PATTERN_MIN=30
PORT=8080     # not needed for now
CRITICAL=30   # connection-count threshold; the script only runs past it

# --- File locations ---------------------------------------------------------
MROOT="/data/resin-pro-3.1.6"
ACCESS_LOG="${MROOT}/logs/access.log"
LIMITIPLOG="${MROOT}/logs/limitIP.log"
IPTABLE="/tmp/.iptablestore"
PY_ACCESS="/var/PROGRAM/MANAGEMENT/modules/xbash/access.py"
SORT_D="/tmp/.sortd"   # temporary store for the segments picked while sorting
SORT_S="/tmp/.sorts"   # temporary store for the single IPs picked while sorting

cd "${MROOT}"
# Current wall-clock fields used by the scheduling checks further down.
# All fields come from a single `date` call so they describe the same instant.
#   now_time_hour / now_time_min : plain decimal integers without a leading
#                                  zero, so "08"/"09" can never be parsed as
#                                  invalid octal in later arithmetic tests
#   now_year / now_month / now_day : kept exactly as `date` prints them
#                                  (zero-padded); used for the log suffix
# Fix: the original "00" hour branch reset the minute instead of the hour,
# wiping the minute during the whole midnight hour.
read -r now_year now_month now_day now_time_hour now_time_min \
  < <(date '+%Y %m %d %H %M')
now_time_hour=$((10#${now_time_hour:-0}))
now_time_min=$((10#${now_time_min:-0}))
# Daily maintenance windows. Both branches end the run with `exit`.

# 08:00-08:20, only when a saved rule set exists: rotate the limit log,
# restore the rules saved during the 02:xx window, then drop the snapshot.
if [[ $now_time_min -ge 0 && $now_time_min -le 20 && $now_time_hour -eq 8 && -e "$IPTABLE" ]]; then
  # Rotate only a log that actually exists
  if [[ -e "$MROOT/logs/limitIP.log" ]]; then
    mv -- "$MROOT/logs/limitIP.log" "$MROOT/logs/limitIP.log.$now_year$now_month$now_day"
  fi
  if [[ "$now_day" == "15" ]]; then
    # Archive the rotated logs; delete them only when the archive was written
    if tar -cvf "$MROOT/logs/limitIP.log.tar" "$MROOT"/logs/limitIP.log.[0-9][0-9]*; then
      rm -f -- "$MROOT"/logs/limitIP.log.[0-9][0-9]*
    fi
  fi
  {
    echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
    echo "复制iptables"
  } >> "$LIMITIPLOG"
  iptables-restore < "$IPTABLE"
  echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" >> "$LIMITIPLOG"
  rm -f -- "$IPTABLE"
  exit
fi

# 02:00-02:50: save the current rules to $IPTABLE, log it, flush all rules.
if [[ $now_time_min -ge 0 && $now_time_min -le 50 && $now_time_hour -eq 2 ]]; then
  # Fix: a second run inside this window used to overwrite the snapshot with
  # the already-flushed table, leaving nothing to restore at 08:xx.
  # An existing snapshot is therefore left untouched.
  if [[ ! -e "$IPTABLE" ]]; then
    iptables-save > "$IPTABLE"
    {
      echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
      echo "已备份iptables"
      echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
    } >> "$LIMITIPLOG"
    iptables -F
  fi
  exit
fi
# Load gate: read the "connections established" counter from `netstat -st`
# and leave when it is below $CRITICAL.
# Fix: a stray character fused to `exit` in the original made the command
# unresolvable, so the gate never stopped the script.
connect=$(netstat -st | awk '/connections established/ {print $1}')
if [[ $connect -lt $CRITICAL ]]; then
  exit # fewer connections than the threshold: nothing to do
fi
# Scratch files and threshold constants.
# Fix: stray characters after several values in the original became part of
# the value itself (e.g. a glob character appended to the GRAY path and to
# FZ, which broke the later arithmetic).
# NOTE(review): these are fixed names under /tmp; some of them ($GRAY,
# $LOGPID) are read again on later runs, so they cannot simply be replaced
# with mktemp names.
BLACK=/tmp/.black     # IPs and IP segments that qualify for the blacklist
GRAY=/tmp/.gray       # grey list
LIST=/tmp/.list       # log lines that matched the current criteria
LISTT=/tmp/.listt     # scratch file; holds IPs that are not part of a segment
LOGPID=/tmp/.sclogpid # stores the grey-list version
BFB=100               # denominator of the percentage
FZ=2                  # numerator of the percentage
TOTALLINE=15000       # number of lines to sample
# Temporary holder for grey-listed IPs
patternGrayIP=""
#######################################
# Print an alternation of "YEAR:HH:T" prefixes, where T is the tens digit of
# the minute, covering the complete ten-minute slots immediately before the
# given time (the slot containing the given minute is not included).
# Arguments:
#   $1 - year, printed as given
#   $2 - hour, 0-23 (a leading zero is accepted)
#   $3 - minute, 0-59 (a leading zero is accepted)
#   $4 - number of ten-minute slots to cover
# Outputs:
#   e.g. "2009:08:1|2009:08:0|2009:07:5" for 2009 08 25 3
# Notes:
#   Slots that fall before midnight keep the year passed in $1.
#######################################
build_time_pattern() {
  local year=$1
  local hour=$((10#$2))
  local minute=$((10#$3))
  local slots=$4
  local current=$((hour * 6 + minute / 10))
  local pattern="" part slot i

  for ((i = 1; i <= slots; i++)); do
    # 144 ten-minute slots per day; wrap around to the previous evening
    slot=$((((current - i) % 144 + 144) % 144))
    printf -v part '%s:%02d:%d' "$year" "$((slot / 6))" "$((slot % 6))"
    pattern+="${pattern:+|}${part}"
  done
  printf '%s\n' "$pattern"
}

# Time pattern for the sample window of $PATTERN_MIN minutes (30 if unset).
# Fix: the original loop tested `real_min` by running it as a command inside
# backticks, so the roll-over into the previous hour never triggered
# reliably; the slots are now derived arithmetically.
read -r pattern_year pattern_hour pattern_clock_min < <(date '+%Y %H %M')
min=$((${PATTERN_MIN:-30} / 10))
pattern_min=$(build_time_pattern "$pattern_year" "$pattern_hour" "$pattern_clock_min" "$min")
time_pattern="$pattern_min"
####################
# Host information #
####################
# Interface named in the last column of the last `netstat -rn` line
INTF=$(netstat -rn | tail -1 | awk '{print $NF}')
# First IPv4 address ifconfig reports for that interface. Both the older
# "inet addr:1.2.3.4" and the newer "inet 1.2.3.4" output layouts are handled;
# the original only understood the former.
# shellcheck disable=SC2086 # an empty INTF must expand to no argument at all
IP=$(ifconfig ${INTF:+"$INTF"} | awk '
  /inet addr:/ { split($2, parts, ":"); print parts[2]; exit }
  /inet [0-9]/ { print $2; exit }')
HOST_NAME=$(hostname --short)

####################
# Mail environment #
####################
#CHARTSET="zh_CN."
# CHARTSET is passed to the mailer further down but was never assigned;
# default it to empty (an inherited value, if any, is kept).
CHARTSET=${CHARTSET:-}
MAIL_CLIENT=""
MAIL_SENDER=""
MAIL_SERVER=""
# NOTE(review): both arms assign an empty server in this copy of the script;
# presumably the real values were removed before publication - confirm.
case "$IP" in
  192.168.230.* | 192.168.1[0-1].* | 192.168.238.* | 202.*)
    MAIL_SERVER=""
    ;;
  *)
    MAIL_SERVER=""
    ;;
esac
##########
# Sample extraction: copy every access-log line matching $time_pattern into
# $LIST and derive the request threshold FZ from the sample size.
LINECOUNT=0
rm -f -- "$LIST" "$LISTT"
testline=$(wc -l < "$ACCESS_LOG")
#### Sample data selected by the time pattern
#tail -$TOTALLINE $ACCESS_LOG | grep -E "$time_pattern" > $LIST
grep -E -- "$time_pattern" "$ACCESS_LOG" > "$LIST"
TOTALLINE=$(wc -l < "$LIST")
# Fix: the original tested `-e`, which is always true because the redirect
# above creates the file; `-s` makes the "no sample" branch reachable.
if [[ -s "$LIST" ]]; then
  patternLINE=$TOTALLINE
  # Threshold = FZ * (sample lines / BFB), using integer division
  BFB=$((patternLINE / BFB))
  FZ=$((FZ * BFB))
  {
    echo "##########################################################"
    date
    echo "匹配的时间模式是:$time_pattern"
    echo "Access_LOG的数据总数是:$testline"
    echo "获取的数据的总数是:$TOTALLINE"
    echo "阀值是:$FZ"
  } >> "$LIMITIPLOG"
else
  {
    echo "##########################################################"
    date
    echo "-----------------------------------------------------------"
    echo "没有获取到样本数据"
  } >> "$LIMITIPLOG"
  exit
fi
# Step 1: sample lines that carry a crawler marker (googlebot and
# baiduspider excepted) put their first field - the client address - on the
# blacklist, one unique entry per line.
grep -iEv "googlebot|baiduspider" "$LIST" \
  | grep -E "spider|bot|Yahoo|archiver|yodaoice" \
  | awk '{print $1}' \
  | sort -u > "$BLACK"

# Step 2: keep for the later checks only the lines that
#   - contain none of the crawler markers, and
#   - do not mention an exempt address.
# Fix: dots in the exemption pattern are now escaped; unescaped they matched
# any character.
# NOTE(review): "xx" looks like a placeholder for a site-specific address and
# as written drops every line containing "xx" - confirm the real value.
grep -Eiv "googlebot|archiver|spider|bot|Yahoo" "$LIST" \
  | grep -Ev '192\.168\.[0-9]+\.[0-9]+|xx|127\.0\.0\.1' > "$LISTT"
cat "$LISTT" > "$LIST"
rm -f -- "$LISTT"
# Report the user-agent blacklist: append it to the limit log and mail a
# per-address count.
if [[ -e "$BLACK" ]]; then
  spiderNum=$(wc -l < "$BLACK")
  if [[ $spiderNum -gt 0 ]]; then
    echo "-------------以下IP包含Spider等特殊字符--------------" >> "$LIMITIPLOG"
    # Fix: a stray character fused to the variable sent this list to a
    # different file name instead of the limit log.
    cat "$BLACK" >> "$LIMITIPLOG"
    limitIP_num=$spiderNum
    sort "$BLACK" | uniq -c > /tmp/.tmp_black
    if [[ -e /tmp/.tmp_black ]]; then
      ######### send mail ############
      # shellcheck disable=SC2086 # MAIL_CLIENT may hold several recipients
      env MAILRC=/dev/null charset="$CHARTSET" from="$MAIL_SENDER" smtp="$MAIL_SERVER" \
        nail -n -s "$HOST_NAME($IP)含有爬虫等关键字名单的IP" $MAIL_CLIENT < /tmp/.tmp_black
    fi
  fi
fi
# Grey-list candidates: feed the sample to access.py and append to $GRAY the
# third column of every row whose first column exceeds the threshold $FZ.
# NOTE(review): the awk filter skips the first two output lines and reads
# column 1 as a request count and column 3 as the client address - this
# assumes a particular access.py report layout; confirm against that tool.
# Fix: stray characters inside the awk program and before the redirect made
# the original pipeline fail.
"$PY_ACCESS" G -gh < "$LIST" \
  | awk '{ if (NR > 2 && $1 ~ /[0-9]+/ && "-" !~ $2 && $3 != "") print $1, $3 }' \
  | while read -r num ip; do
      if [[ $num -gt $FZ ]]; then
        echo "$ip" >> "$GRAY"
      fi
    done
# Grey-list escalation. With a non-empty $GRAY:
#   - no $LOGPID yet: record the current hour in it and stop;
#   - otherwise, once more than one hour lies between the recorded hour and
#     now, every address listed more than once in $GRAY is mailed, logged,
#     becomes the new content of $GRAY and is appended to $BLACK.
# Fix: the hour comparison used the undefined name `last_now`; the value read
# from $LOGPID is `last_hour`.
# NOTE(review): nothing in this block refreshes or removes $LOGPID after it
# is first written - confirm that this is intended.
if [[ -e "$GRAY" ]]; then
  tmp_gray_line=$(wc -l < "$GRAY")
  if [[ $tmp_gray_line -gt 0 ]]; then
    if [[ -e "$LOGPID" ]]; then
      last_hour=$(awk '{print $1}' "$LOGPID")
      if ((now_time_hour - ${last_hour:-0} > 1)); then
        # Addresses that appear at least twice in the grey list
        sort "$GRAY" | uniq -c | awk '$1 > 1 {print $2}' > "$LISTT"

        if [[ -e "$LISTT" ]]; then
          logcount=$(wc -l < "$LISTT")

          if [[ $logcount -gt 0 ]]; then
            ######### send mail ############
            # shellcheck disable=SC2086 # MAIL_CLIENT may hold several recipients
            env MAILRC=/dev/null charset="$CHARTSET" from="$MAIL_SENDER" smtp="$MAIL_SERVER" \
              nail -n -s "$HOST_NAME($IP)两次连续进入灰名单名单的IP" $MAIL_CLIENT < "$LISTT"
            echo "-------------以下字符是两次连续进入灰名单的------" >> "$LIMITIPLOG"
            cat "$LISTT" >> "$LIMITIPLOG"
            cat "$LISTT" > "$GRAY"
            ############ block the grey list ##################
            cat "$GRAY" >> "$BLACK"
            # Kept for observation for the time being
            cat "$GRAY" >> "$MROOT/logs/gray.log"
          fi
        fi
      fi
    else
      echo "$now_time_hour" > "$LOGPID"
    fi
  fi
fi
# Special-URL check. When the script is given a file of URL patterns as $1,
# every grey-listed address is looked up in the sample via access.py; an
# address with at least one row matching one of the patterns is mailed and
# logged. This block only reports - it does not add anything to $BLACK.
# Fixes:
#   - the grep pattern used the misspelt, always-empty `$limitedYrl`, so
#     every row matched;
#   - the hit test ran `tap` as a command inside backticks;
#   - duplicates were removed with `uniq` on unsorted input;
#   - access.py is now called through $PY_ACCESS like the earlier call.
rm -f -- "$LISTT"
echo "检查特殊文件的存在" >> "$LIMITIPLOG"
if [[ $# -eq 0 ]]; then
  echo "没有设置特殊连点文件,跳过特殊连点检查" >> "$LIMITIPLOG"
else
  graylinecount=0
  if [[ -e "$GRAY" ]]; then
    graylinecount=$(wc -l < "$GRAY")
  fi
  i=0
  if [[ $graylinecount -gt 1 ]]; then
    awk '{print $1}' "$GRAY" \
      | while read -r ip; do
          [[ -n "$ip" ]] || continue
          awk '{print $1}' "$1" \
            | while read -r limitedUrl; do
                # NOTE(review): NR > 2 is applied after grep, so it skips the
                # first two *matching* rows - kept as in the original; confirm.
                tap=$("$PY_ACCESS" FG -gr -h "$ip" < "$LIST" \
                  | grep -E -- "$limitedUrl" \
                  | awk '{ if (NR > 2 && $1 ~ /[0-9]+/ && "-" !~ $2 && $3 != "" && $1 > 0) print $3 }' \
                  | wc -l)
                if [[ $tap -gt 0 ]]; then
                  echo "$ip" >> "$LISTT"
                fi
              done
        done
    if [[ -e "$LISTT" ]]; then
      sort -u "$LISTT" > /tmp/.gray_tsld
      tsldEmailTap=$(wc -l < /tmp/.gray_tsld)
      if [[ $tsldEmailTap -gt 0 ]]; then
        ######### send mail ############
        # shellcheck disable=SC2086 # MAIL_CLIENT may hold several recipients
        env MAILRC=/dev/null charset="$CHARTSET" from="$MAIL_SENDER" smtp="$MAIL_SERVER" \
          nail -n -s "$HOST_NAME ($IP)爬虫检查过程中在灰名单发现有特殊连接的IP" $MAIL_CLIENT < /tmp/.gray_tsld
        echo "----------------爬虫检查过程中在灰名单发现有特殊连接的IP-------------------" >> "$LIMITIPLOG"
        cat /tmp/.gray_tsld >> "$LIMITIPLOG"
        rm -f /tmp/.gray_tsld
      fi
    fi
  fi
fi
rm -f -- "$LISTT"
rm -f -- "$LIST"
#######################################
# Split a blacklist into /24 segments and single addresses.
# A three-octet prefix shared by two or more DISTINCT addresses is written
# to the segment file; every other address is written to the singles file.
# Arguments:
#   $1 - blacklist file, one address per line
#   $2 - segment output file (truncated first), lines like "10.0.0"
#   $3 - singles output file (appended to), one address per line
# Returns:
#   0; does nothing when an output path is empty
#######################################
classify_blacklist() {
  local black_file=$1 segments_out=$2 singles_out=$3

  [[ -n "$segments_out" && -n "$singles_out" ]] || return 0
  : > "$segments_out"
  [[ -s "$black_file" ]] || return 0

  # sort -u first: the original ran `uniq -c` on unsorted input and counted
  # a repeated address as two hosts, turning it into a whole /24 block.
  sort -u -- "$black_file" \
    | awk -F. -v seg_out="$segments_out" -v ip_out="$singles_out" '
        NF >= 3 {
          prefix = sprintf("%d.%d.%d", $1, $2, $3)
          hits[prefix]++
          sample[prefix] = $0
        }
        END {
          for (prefix in hits) {
            if (hits[prefix] > 1) {
              print prefix >> seg_out
            } else {
              print sample[prefix] >> ip_out
            }
          }
        }'
}

# Segments go to $LISTT, single addresses to $LIST.
classify_blacklist "$BLACK" "$LISTT" "$LIST"
#######################################
# Tell whether the current `iptables -nL` listing mentions an address.
# The address is matched as a fixed string on word boundaries, so "1.2.3.4"
# matches neither "1.2.3.40" nor "11.2.3.4" (the original used an unanchored
# regular expression).
# Arguments: $1 - address or CIDR block to look for
# Returns:   0 when listed, non-zero otherwise
#######################################
rule_listed() {
  iptables -nL | grep -qwF -- "$1"
}

# Block the single addresses in $LIST, skipping those already listed on
# their own or covered by a listed /24.
# Fixes: the segment check built "a.b.c.d.0/24" from the full address and
# tested it by running `dtap` as a command inside backticks, so it never hit.
if [[ -e "$LIST" ]]; then
  awk '{print $1}' "$LIST" \
    | while read -r ip; do
        [[ -n "$ip" ]] || continue
        rule_listed "$ip" && continue
        rule_listed "${ip%.*}.0/24" && continue
        #iptables -A INPUT -s $ip -p tcp --dport $PORT -j REJECT --reject-with tcp-reset
        iptables -A INPUT -s "$ip" -p tcp -j DROP
      done
fi

# Block the /24 segments whose three-octet prefixes are in $LISTT
if [[ -e "$LISTT" ]]; then
  awk 'NF {print $1 ".0/24"}' "$LISTT" \
    | while read -r ip; do
        rule_listed "$ip" && continue
        #iptables -A INPUT -s $ip -p tcp --dport $PORT -j REJECT --reject-with tcp-reset
        iptables -A INPUT -s "$ip" -p tcp -j DROP
      done
fi
# During hour 12: compact the rule set. DROP sources sharing a three-octet
# prefix with at least one other DROP source are merged into one /24 rule;
# all remaining sources are re-added unchanged.
# NOTE(review): `iptables -F` flushes every rule, and only DROP sources are
# re-created afterwards - rules of any other kind are lost; confirm that the
# host carries no such rules.
if [[ $now_time_hour -eq 12 ]]; then
  ############ re-sort the iptables table ###################
  drop_sources=$(iptables -nL | awk '/DROP/ {print $4}')

  # Prefixes seen more than once are handled as whole segments
  awk -F. '{print $1 "." $2 "." $3}' <<< "$drop_sources" \
    | sort | uniq -c | awk '$1 > 1 {print $2}' > "$SORT_D"

  # Start from a clean singles file; the loop below appends to it
  rm -f -- "$SORT_S"
  while read -r ip; do
    [[ -n "$ip" ]] || continue
    IFS=. read -r octet1 octet2 octet3 _ <<< "$ip"
    ltmp="$octet1.$octet2.$octet3"
    # Fix: exact fixed-string line match. The original regex match let
    # prefix 1.2.3 hit segment 11.2.30, so such an address was neither kept
    # as a single nor re-added after the flush.
    if ! grep -qxF -- "$ltmp" "$SORT_D"; then
      echo "$ip" >> "$SORT_S"
    fi
  done <<< "$drop_sources"

  iptables -F

  # Re-add the single sources
  if [[ -e "$SORT_S" ]]; then
    awk '{print $1}' "$SORT_S" \
      | while read -r ip; do
          #iptables -A INPUT -s $ip -p tcp --dport $PORT -j REJECT --reject-with tcp-reset
          iptables -A INPUT -s "$ip" -p tcp -j DROP
        done
  fi

  # Re-add the merged segments
  if [[ -e "$SORT_D" ]]; then
    awk '{print $1 ".0/24"}' "$SORT_D" \
      | while read -r ip; do
          #iptables -A INPUT -s $ip -p tcp --dport $PORT -j REJECT --reject-with tcp-reset
          iptables -A INPUT -s "$ip" -p tcp -j DROP
        done
  fi

  rm -f -- "$SORT_D"
  rm -f -- "$SORT_S"
fi
转载于:https://blog.51cto.com/liang3391/456520
总结
以上是生活随笔为你收集整理的一个简单的防爬虫脚本(转载欧彬)的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 如何成为有思想、创新的程序员
- 下一篇: aspnet前后台条件下根目录的读取