网页爬虫python代码_Python 爬虫web网页版程序代码
一:網(wǎng)頁(yè)結(jié)構(gòu)分析
二:代碼實(shí)戰(zhàn)#! /usr/bin/env python2
# encoding=utf-8
#BeautifulSoup需要安裝 MySQLdb
import sys,os,re,hashlib
import urllib
import httplib2
from lxml import etree
import MySQLdb
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import time
reload(sys)
from datetime import datetime as dt,timedelta
import re
# Shared HTTP client with a 10-second timeout, used by download() below.
h=httplib2.Http(timeout=10)
# Request headers spoofing an MSIE 8 / Windows browser.
headers={
'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'
}
# Regex intended to match <a> tags.
# NOTE(review): '(.*?)' alone matches the empty string -- the tag markers
# look lost in transcription; confirm against the original source.
# (This variable is never used below.)
pattern = '(.*?)'
# Daily log file under ./sporttery (opened here but never written to below).
log_path='./sporttery'
log_file='%s.log' % dt.now().strftime('%Y-%m-%d')
if not os.path.exists(log_path):
os.makedirs(log_path)
log=open('%s/%s' % (log_path,log_file),'w+')
# MySQL connection; utf8 is forced on the connection, client and result
# charsets so the scraped Chinese text round-trips correctly.
conn= MySQLdb.connect(
host='localhost',
port = 3306,
user='root',
passwd='root',
db ='test',
)
conn.set_character_set('utf8')
cur = conn.cursor()
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')
cur.close()
#獲取請(qǐng)求鏈接內(nèi)容 失敗再次執(zhí)行
def download(url):
    """Fetch *url* with a GET request using the spoofed browser headers.

    Retries on failure, up to 6 attempts in total; returns the response
    body decoded as UTF-8 (undecodable bytes dropped), or None when every
    attempt failed.
    """
    fails = 0
    while fails <= 5:
        try:
            res, content = h.request(url, 'GET', headers=headers)
            return content.decode('utf-8', 'ignore')
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate instead of being swallowed by the retry loop.
            print(u'打開(kāi)鏈接失敗'+url)
            fails += 1
    return None
#字符串截取方法
def GetMiddleStr(content, startStr, endStr):
    """Return the substring of *content* between *startStr* and *endStr*.

    Raises ValueError (from str.index) when either marker is missing.

    Fixes two defects in the original:
    - the `if startIndex >= 0` guard was dead code (str.index raises on a
      miss, it never returns a negative value), and left the function with
      an implicit `return None` path that could never trigger sanely;
    - the end marker was searched from the start of *content*, so an
      occurrence of *endStr* BEFORE *startStr* produced a wrong (even
      negative-length) slice.  It is now searched after the start marker.
    """
    startIndex = content.index(startStr) + len(startStr)
    endIndex = content.index(endStr, startIndex)
    return content[startIndex:endIndex]
def get_ul(data):
# Extract the listing region from the index page HTML.
# NOTE(review): both marker strings are empty, so GetMiddleStr always
# returns '' here.  The original HTML tag markers (presumably something
# like '<ul ...>' and '</ul>') look lost in transcription -- confirm
# against the original source before running.
mystring=GetMiddleStr(data,'','')
return mystring
def test_sporttery(i):
# Crawl page *i* of the video listing: download the index page, collect the
# href of every linked detail page, extract the .m3u8 stream URL, the title
# and the description from each detail page, and persist one row per video
# via saveDB().
# NOTE(review): this listing is garbled by transcription -- the split()
# argument and the commented-out GetMiddleStr call below contain literal
# line breaks inside string literals, and the regexes carry doubled
# backslashes; it cannot run byte-for-byte as shown.
url='http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_'+str(i)+'.html'
print url
# e.g. http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_2.html
source=download(url)
data=get_ul(source)
datas=data.split('
')
for each in datas:
# Pull every href value (double- or single-quoted) out of the fragment.
ret=re.findall(r"(?<=href=\\").+?(?=\\")|(?<=href=\\').+?(?=\\')" ,each)
for urls in ret:
detial=download(urls)
if detial:
detial_content=GetMiddleStr(detial,'createFlashVideo','m3u8').replace(' ', '')
if detial_content:
# Final stream URL, reconstructed to end in '.m3u8'.
end_url_rex=GetMiddleStr(detial_content+".m3u8",'http://','.m3u8')+"m3u8"
# Title: taken from the <h2> inside the #playVideo element.
sstree = etree.HTML(detial)
ssnodes = sstree.xpath('//*[@id="playVideo"]/div[1]/h2')
for ssn in ssnodes:
name= ssn.text.strip().replace('/h2>', '')
#title=GetMiddleStr(detial,'
').replace(' ', '')
# Description: strip tags out of the video-info block.
introduction=GetMiddleStr(detial,'video-info">','').replace(' ', '')
dr = re.compile(r']+>',re.S)
introductions = dr.sub('',introduction)
end_content=introductions.strip().replace('/span>', '')
# Local time shifted by +8h -- presumably to convert the server clock
# to China Standard Time; TODO confirm the server is not already CST.
end_time= time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()+8*60*60))
#end_times=dt.now().strftime('%Y-%m-%d %H:%i:%S')
saveDB(urls,end_url_rex,name,end_content,str(i),end_time)
def saveDB(current_url, end_url_rex, names, end_content, page, create_time):
    """Insert one scraped video record into test.mytables.

    Parameters mirror the table columns: the detail-page URL, the resolved
    .m3u8 stream URL, the title, the description, the listing page number
    (as a string) and a pre-formatted creation timestamp.

    Fixes over the original:
    - the backslash line continuation inside the SQL string literal was
      doubled in transcription, making it a syntax error; the statement is
      now built from adjacent string literals instead;
    - the cursor is closed even when execute() raises (try/finally), so a
      failed insert no longer leaks a cursor;
    - the debug `print sql` was dropped.
    The query stays parameterized, so MySQLdb escapes all values.
    """
    sql = ('INSERT INTO test.mytables'
           '(current_url,end_url_rex,`names`,end_content,page,create_time) '
           'VALUES (%s,%s,%s,%s,%s,%s)')
    cur = conn.cursor()
    try:
        cur.execute(sql, (current_url, end_url_rex, names,
                          end_content, page, create_time))
    finally:
        cur.close()
    # Commit only reached on success, matching the original control flow.
    conn.commit()
if __name__ == '__main__':
    # Fetch listing page 1 once, read the pagination widget to learn how
    # many listing pages exist, then crawl every page in order.
    first = "http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_1.html"
    content = urllib2.urlopen(first).read()
    soup = BeautifulSoup(content)
    pagination = soup.findAll(attrs={"class": "pagination"})
    # Fix: the transcribed pattern r'\\d+' matched a literal backslash
    # followed by digits; r'\d+' extracts the page numbers.
    nums = re.findall(r'\d+', str(pagination[0]))
    # Fix: the original took list(set(nums))[-1], but set iteration order
    # is arbitrary, so the "last page" could have been any of the numbers
    # (and '9' > '10' as strings anyway).  Use the numeric maximum.
    last_page = max(int(n) for n in nums)
    for i in range(1, last_page + 1):
        test_sporttery(i)
總結
以上是生活随笔為你收集整理的网页爬虫python代码_Python 爬虫web网页版程序代码的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: java gc日志乱码_6000+字,3
- 下一篇: python难度如何_入门Python学