Python爬虫实战(三):定时爬取数据存入SqlServer
目錄
- 🌹前言
- 爬取目標(biāo)(效果展示)
- 準(zhǔn)備工作
- 代碼分析
- 第一步
- 第二步
- 第三步
- 第四步
- 完整代碼
- 啟動(dòng)
🌹前言
-
🏆🏆作者介紹:Python領(lǐng)域優(yōu)質(zhì)創(chuàng)作者、華為云享專家、阿里云專家博主、2021年CSDN博客新星Top6
- 🔥🔥本文已收錄于Python爬蟲實(shí)戰(zhàn)100例專欄:《Python爬蟲實(shí)戰(zhàn)100例》
- 📝?📝?此專欄文章是專門針對(duì)Python爬蟲實(shí)戰(zhàn)案例從基礎(chǔ)爬蟲到進(jìn)階爬蟲,歡迎免費(fèi)訂閱
爬取目標(biāo)(效果展示)
效果展示:
爬取的內(nèi)容是:標(biāo)題、榜單、熱度值、新聞?lì)愋汀r(shí)間戳、url地址等
準(zhǔn)備工作
建表:
-- Hot-list table. Every text column is NVARCHAR(MAX) so no field can
-- overflow on insert, whatever the API returns.
CREATE TABLE "WB_HotList" (
    "id"              INT IDENTITY(1,1) PRIMARY KEY,
    "batch"           NVARCHAR(MAX),
    "daydate"         SMALLDATETIME,
    "star_word"       NVARCHAR(MAX),
    "title"           NVARCHAR(MAX),
    "category"        NVARCHAR(MAX),
    "num"             NVARCHAR(MAX),
    "subject_querys"  NVARCHAR(MAX),
    "flag"            NVARCHAR(MAX),
    "icon_desc"       NVARCHAR(MAX),
    "raw_hot"         NVARCHAR(MAX),
    "mid"             NVARCHAR(MAX),
    "emoticon"        NVARCHAR(MAX),
    "icon_desc_color" NVARCHAR(MAX),
    "realpos"         NVARCHAR(MAX),
    "onboard_time"    SMALLDATETIME,
    "topic_flag"      NVARCHAR(MAX),
    "ad_info"         NVARCHAR(MAX),
    "fun_word"        NVARCHAR(MAX),
    "note"            NVARCHAR(MAX),
    "rank"            NVARCHAR(MAX),
    "url"             NVARCHAR(MAX)
)
代碼分析
第一步
發(fā)送請(qǐng)求,獲取網(wǎng)頁(yè)信息
提供了數(shù)據(jù)的接口,所以我們直接訪問(wèn)接口就行,如下圖(json格式):
# API endpoint: https://weibo.com/ajax/statuses/hot_band

def __init__(self):
    self.url = "https://weibo.com/ajax/statuses/hot_band"
    self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}

# Send the request and return the decoded response body.
def parse_url(self):
    response = requests.get(self.url, headers=self.headers)
    time.sleep(2)  # be polite: pause two seconds after each request
    return response.content.decode()

第二步
解析數(shù)據(jù),提取我們所需要的數(shù)據(jù)
接口中的數(shù)據(jù)格式化如下(只需提取我們所需要的):
第三步
數(shù)據(jù)庫(kù)的batch用于判斷,每次插入的批次(50個(gè)一批),如果爬蟲斷了,寫個(gè)方法還能接著上次的批次
如圖:
第四步
把數(shù)據(jù)存入數(shù)據(jù)庫(kù)
# Connect to the database server and create a cursor.
# Arguments: server, user, password, database.
db = pymssql.connect('.', 'sa', 'yuan427', 'test')
if db:
    print("連接成功!")
cursor = db.cursor()

try:
    # Parameterized INSERT; daydate is filled server-side via getdate().
    # NOTE: target WB_HotList (the table created earlier) — the original
    # snippet inserted into "test4", which the batch lookup never reads.
    # Also, the flattened one-liner contained "\topic_flag" inside the SQL
    # string: "\t" is a tab escape, silently turning the column name into
    # "opic_flag". Building the string by literal concatenation avoids that.
    sql = ("insert into WB_HotList(batch,daydate,star_word,title,category,num,"
           "subject_querys,flag,icon_desc,raw_hot,mid,emoticon,icon_desc_color,"
           "realpos,onboard_time,topic_flag,ad_info,fun_word,note,rank,url) "
           "values (%s,getdate(),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    # Execute the insert with the 20 scraped values.
    cursor.execute(sql, (batch, star_word, title, category, num, subject_querys,
                         flag, icon_desc, raw_hot, mid, emoticon, icon_desc_color,
                         realpos, onboard_time, topic_flag, ad_info,
                         fun_word, note, rank, url))
    db.commit()
    print('成功載入......')
except Exception as e:
    db.rollback()
    print(str(e))

# Close the cursor and disconnect from the database.
cursor.close()
db.close()
完整代碼
import requests, pymssql, time, json, re, datetime
from threading import Timer


class Spider:
    """Scrape the Weibo hot-search board every half hour into SQL Server."""

    def __init__(self):
        self.url = "https://weibo.com/ajax/statuses/hot_band"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}

    def parse_url(self):
        """Fetch the hot-band JSON API and return the decoded body."""
        response = requests.get(self.url, headers=self.headers)
        time.sleep(2)  # be polite: pause between requests
        return response.content.decode()

    def parse_data(self, data, a):
        """Parse the JSON payload and insert the top-50 entries as batch number `a`.

        Fixes vs. original: missing keys become NULL via dict.get instead of
        raising NameError (first item) or silently reusing the previous
        item's value (later items); the insert targets WB_HotList so that
        batch() can actually see the rows it wrote.
        """
        json_data = json.loads(data)
        # server, user, password, database
        db = pymssql.connect('.', 'sa', 'yuan427', 'test')
        cursor = db.cursor()

        sql = ("insert into WB_HotList(batch,daydate,star_word,title,category,num,"
               "subject_querys,flag,icon_desc,raw_hot,mid,emoticon,icon_desc_color,"
               "realpos,onboard_time,topic_flag,ad_info,fun_word,note,rank,url) "
               "values (%s,getdate(),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

        # One batch is at most the top 50 entries; slicing avoids an
        # IndexError when the API returns fewer.
        for ban_list in json_data['data']['band_list'][:50]:
            batch = f'第{a}批'
            star_word = ban_list.get('star_word')
            title = ban_list.get('word')
            category = ban_list.get('category')
            num = ban_list.get('num')
            subject_querys = ban_list.get('subject_querys')
            flag = ban_list.get('flag')
            icon_desc = ban_list.get('icon_desc')
            raw_hot = ban_list.get('raw_hot')
            mid = ban_list.get('mid')
            emoticon = ban_list.get('emoticon')
            icon_desc_color = ban_list.get('icon_desc_color')
            realpos = ban_list.get('realpos')
            topic_flag = ban_list.get('topic_flag')
            ad_info = ban_list.get('ad_info')
            fun_word = ban_list.get('fun_word')
            note = ban_list.get('note')

            onboard_time = ban_list.get('onboard_time')
            if onboard_time is not None:
                # API gives a Unix timestamp; convert for the SMALLDATETIME column
                onboard_time = datetime.datetime.fromtimestamp(onboard_time)

            rank = ban_list.get('rank')
            if rank is not None:
                rank = rank + 1  # stored rank is 1-based

            url = None
            try:
                # the landing URL is embedded in the mblog HTML snippet
                text = ban_list['mblog']['text']
                url = re.findall('href="(.*?)"', text)[0]
            except (KeyError, TypeError, IndexError) as e:
                print(e)

            try:
                cursor.execute(sql, (batch, star_word, title, category, num,
                                     subject_querys, flag, icon_desc, raw_hot, mid,
                                     emoticon, icon_desc_color, realpos, onboard_time,
                                     topic_flag, ad_info, fun_word, note, rank, url))
                db.commit()
                print('成功載入......')
            except Exception as e:
                db.rollback()
                print(str(e))

        # Close the cursor and disconnect.
        cursor.close()
        db.close()

    def batch(self):
        """Return all values of the batch column, used to resume numbering."""
        conn = pymssql.connect('.', 'sa', 'yuan427', 'test')
        cursor = conn.cursor()
        cursor.execute("select batch from WB_HotList")
        rows = cursor.fetchall()
        # list comprehension instead of a loop variable named `list`
        # (the original shadowed the builtin)
        batchlist = [row[0] for row in rows]
        cursor.close()   # original leaked this connection
        conn.close()
        return batchlist

    def run(self, a):
        """Scrape one batch, then reschedule itself every 1800 s (half an hour)."""
        # Resume from the last batch number recorded in the database, if any.
        batchlist = self.batch()
        if batchlist:
            found = re.findall('第(.*?)批', batchlist[-1])
            a = int(found[0]) + 1
        data = self.parse_url()
        self.parse_data(data, a)
        a += 1
        t = Timer(1800, self.run, (a,))
        t.start()


if __name__ == "__main__":
    spider = Spider()
    spider.run(1)
啟動(dòng)
因?yàn)樾枰恢边\(yùn)行,所以就在 cmd 掛著
運(yùn)行成功后,去數(shù)據(jù)庫(kù)看看:
O了O了!!!
有講的不對(duì)的地方,希望各位大佬指正!如果有不明白的地方,歡迎在評(píng)論區(qū)留言,我會(huì)回復(fù)!兄弟們來(lái)個(gè)點(diǎn)贊,有空就更新爬蟲實(shí)戰(zhàn)!
總結(jié)
以上是生活随笔為你收集整理的Python爬虫实战(三):定时爬取数据存入SqlServer的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
- 上一篇: qt中opengl窗口的创建
- 下一篇: qpython3 安装库_qpython