Python爬虫实战糗事百科实例
生活随笔
收集整理的這篇文章主要介紹了
Python爬虫实战糗事百科实例
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
爬取糗事百科段子,假設頁面的URL是 http://www.qiushibaike.com/8hr/page/1
要求:
-
使用requests獲取頁面信息,用XPath / re 做數據提取
-
獲取每個帖子里的
用戶頭像鏈接、用戶姓名、段子內容、點贊次數和評論次數 -
保存到 json 文件內
參考代碼
# qiushibaike.py
# Single-process example: download one listing page of qiushibaike.com and
# print, for every post, the avatar URL, username, content, vote count and
# comment count.
#
# Ported to Python 3: print() function, `except ... as e`, and no manual
# .encode('utf-8') (requests/lxml already hand back str).
import requests
from lxml import etree

page = 1
url = 'http://www.qiushibaike.com/8hr/page/' + str(page)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

try:
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # Every post sits in a div whose id contains "qiushi_tag".
    for site in html.xpath('//div[contains(@id,"qiushi_tag")]'):
        img_url = site.xpath('./div/a/img/@src')[0]
        username = site.xpath('./div/a/@title')[0]
        content = site.xpath('.//div[@class="content"]/span')[0].text.strip()
        vote = site.xpath('.//i')[0].text      # vote count
        comments = site.xpath('.//i')[1].text  # comment count
        print(img_url, username, content, vote, comments)
except Exception as e:
    # Best-effort demo script: report the failure instead of crashing.
    print(e)
演示效果
?
多線程糗事百科案例
案例要求參考上面糗事百科單進程案例
Queue(隊列對象)
Queue是python中的標準庫(Python 2 中為 import Queue,Python 3 中已改名為 queue 模塊),可以直接引用;隊列是線程間最常用的交換數據的形式
python下多線程的思考
對于資源,加鎖是個重要的環節。因為python原生的list,dict等,都是not thread safe的。而Queue,是線程安全的,因此在滿足使用條件下,建議使用隊列
-
初始化: class Queue.Queue(maxsize) FIFO 先進先出
-
包中的常用方法:
-
Queue.qsize() 返回隊列的大小
-
Queue.empty() 如果隊列為空,返回True,反之False
-
Queue.full() 如果隊列滿了,返回True,反之False
-
Queue.full 與 maxsize 大小對應
-
Queue.get([block[, timeout]])獲取隊列,timeout等待時間
-
-
創建一個“隊列”對象
- import Queue
- myqueue = Queue.Queue(maxsize = 10)
-
將一個值放入隊列中
- myqueue.put(10)
-
將一個值從隊列中取出
- myqueue.get()
多線程示意圖
?
# -*- coding:utf-8 -*-
# Multi-threaded qiushibaike scraper: thread_crawl threads download listing
# pages, Thread_Parser threads extract posts and append them as JSON lines.
#
# Ported to Python 3: print(), `except ... as e`, the `queue` module
# (renamed from `Queue`), and writing str (not bytes) to the text-mode file.
import json
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree


class thread_crawl(threading.Thread):
    """Download thread: pulls page numbers from a queue, fetches each page
    and pushes the raw HTML onto the shared data_queue."""

    def __init__(self, threadID, q):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.q = q  # queue of page numbers still to fetch

    def run(self):
        print("Starting " + self.threadID)
        self.qiushi_spider()
        print("Exiting ", self.threadID)

    def qiushi_spider(self):
        """Fetch pages until the page-number queue is drained."""
        while True:
            if self.q.empty():
                break
            page = self.q.get()
            print('qiushi_spider=', self.threadID, ',page=', str(page))
            url = 'http://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                'Accept-Language': 'zh-CN,zh;q=0.8'}
            # Retry a few times so one flaky request does not lose the page.
            # (The original tested `if timeout < 0` after a `while timeout > 0`
            # loop, which can never be true; use an explicit success flag.)
            fetched = False
            for _ in range(4):
                try:
                    content = requests.get(url, headers=headers)
                    data_queue.put(content.text)
                    fetched = True
                    break
                except Exception as e:
                    print('qiushi_spider', e)
            if not fetched:
                print('timeout', url)


class Thread_Parser(threading.Thread):
    """Parser thread: takes downloaded HTML off the queue, extracts every
    post and appends it to the shared JSON output file."""

    def __init__(self, threadID, queue, lock, f):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.queue = queue  # queue of raw HTML documents
        self.lock = lock    # guards the output file and the `total` counter
        self.f = f          # shared output file handle

    def run(self):
        print('starting ', self.threadID)
        global total, exitFlag_Parser
        while not exitFlag_Parser:
            try:
                # Non-blocking get: raises queue.Empty immediately when no
                # page is ready, so we keep polling until told to exit.
                # (The original's bare `except: pass` also hid real errors.)
                item = self.queue.get(False)
                self.parse_data(item)
                self.queue.task_done()
                print('Thread_Parser=', self.threadID, ',total=', total)
            except Empty:
                pass
        print('Exiting ', self.threadID)

    def parse_data(self, item):
        """Parse one HTML listing page and write each post as a JSON line.

        :param item: raw HTML text of one listing page
        """
        global total
        try:
            html = etree.HTML(item)
            # Every post sits in a div whose id contains "qiushi_tag".
            # (The original rebound the name `result` to the per-post dict
            # while iterating the `result` node list; renamed to `entry`.)
            for site in html.xpath('//div[contains(@id,"qiushi_tag")]'):
                try:
                    imgUrl = site.xpath('.//img/@src')[0]
                    title = site.xpath('.//h2')[0].text
                    content = site.xpath('.//div[@class="content"]/span')[0].text.strip()
                    # Vote/comment counters may be missing on some posts.
                    vote = None
                    comments = None
                    try:
                        vote = site.xpath('.//i')[0].text
                        comments = site.xpath('.//i')[1].text
                    except Exception:
                        pass
                    entry = {
                        'imgUrl': imgUrl,
                        'title': title,
                        'content': content,
                        'vote': vote,
                        'comments': comments,
                    }
                    # The file handle is shared by all parser threads.
                    with self.lock:
                        self.f.write(json.dumps(entry, ensure_ascii=False) + "\n")
                except Exception as e:
                    print('site in result', e)
        except Exception as e:
            print('parse_data', e)

        # One more page fully processed.
        with self.lock:
            total += 1


# Queue of downloaded HTML pages, shared by crawl and parser threads.
data_queue = Queue()
# Shared state for the crawl/parse threads.
exitFlag_Parser = False   # set True to tell the parser threads to exit
lock = threading.Lock()   # guards the output file and `total`
total = 0                 # number of pages parsed so far


def main():
    """Scrape pages 1-10 with 3 download and 3 parser threads, appending
    the extracted posts to qiushibaike.json (one JSON object per line)."""
    output = open('qiushibaike.json', 'a', encoding='utf-8')

    # Seed the work queue with the page numbers 1..10.
    pageQueue = Queue(50)
    for page in range(1, 11):
        pageQueue.put(page)

    # Start the download threads.
    crawlthreads = []
    crawlList = ["crawl-1", "crawl-2", "crawl-3"]
    for threadID in crawlList:
        thread = thread_crawl(threadID, pageQueue)
        thread.start()
        crawlthreads.append(thread)

    # Start the parser threads.
    parserthreads = []
    parserList = ["parser-1", "parser-2", "parser-3"]
    for threadID in parserList:
        thread = Thread_Parser(threadID, data_queue, lock, output)
        thread.start()
        parserthreads.append(thread)

    # Crawl threads exit on their own once pageQueue is drained, so joining
    # them is sufficient.  (The original also busy-waited on
    # pageQueue.empty() with `pass`, burning a full CPU core for nothing.)
    for t in crawlthreads:
        t.join()

    # Wait until the parsers have consumed everything that was downloaded;
    # sleep between polls instead of spinning.
    while not data_queue.empty():
        time.sleep(0.1)

    # Tell the parser threads to exit, then wait for them.
    global exitFlag_Parser
    exitFlag_Parser = True
    for t in parserthreads:
        t.join()

    print("Exiting Main Thread")
    output.close()


if __name__ == '__main__':
    main()
?
?
總結
以上是生活随笔為你收集整理的Python爬虫实战糗事百科实例的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Python:数据提取之JSON与Jso
- 下一篇: Python:Selenium和Phan