6、通过xpath获取网页数据
生活随笔
收集整理的這篇文章主要介紹了
6、通过xpath获取网页数据
小編覺得挺不錯的，現在分享給大家，幫大家做個參考。
1、xpath解析網頁源文件
from urllib import request

from lxml import etree

# URL of the page to request
url = "http://www.dfenqi.cn/Product/Index"
# Request headers: send a browser User-Agent instead of urllib's default one
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}
# Build the request object
req = request.Request(url, headers=headers)
# Create the HTTP handler
httpHandler = request.HTTPHandler()
# Build an opener around the handler
opener = request.build_opener(httpHandler)
# Send the request; the with-block closes the connection once the body is read
with opener.open(req) as response:
    # Read the page source and decode it as UTF-8 text
    html = response.read().decode('utf-8')
# XPath expression selecting the text of each <p> in the "liebiao" list items
xpath = "//div[@class='liebiao']/ul/li/p/text()"
# Alternative: select the class attribute values instead of the text
# xpath = "//div[@class='liebiao']/ul/li/p/@class"
# Parse the HTML into an element tree that can be queried
selector = etree.HTML(html)
# Run the XPath query; the result is a list of matches
goodsList = selector.xpath(xpath)
# Print each product title
for goods in goodsList:
    print(goods)

# 2. Parse the page source with XPath and download the images to local disk
import os
from urllib import request
from urllib.parse import urljoin

from lxml import etree


class Spilder():
    """Minimal page scraper: fetch a page, extract image URLs, save the images."""

    def __init__(self, pageUrl):
        """
        Store the target URL and prepare the objects used for every request.

        :param pageUrl: URL of the page to crawl
        """
        # URL of the page to crawl
        self.pageUrl = pageUrl
        # Request headers: send a browser User-Agent instead of urllib's default
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
        # Handler used for the requests
        self.httpHandler = request.HTTPHandler()
        # Opener shared by all requests made through this instance
        self.opener = request.build_opener(self.httpHandler)

    def loadPage(self):
        """
        Request the page at ``self.pageUrl``.

        :return: the raw page source as bytes
        """
        req = request.Request(self.pageUrl, headers=self.headers)
        # The with-block closes the connection once the body has been read.
        with self.opener.open(req) as response:
            return response.read()

    def getImageUrls(self, html, xpath):
        """
        Parse the page source and evaluate an XPath expression against it.

        :param html: page source
        :param xpath: XPath expression string
        :return: list of matches returned by the XPath query
        """
        selector = etree.HTML(html)
        return selector.xpath(xpath)

    def loadImage(self, url):
        """
        Download one image.

        :param url: image URL; a relative URL is resolved against
            ``self.pageUrl``, an absolute URL is used unchanged
        :return: the image data as bytes
        """
        # ``src`` attributes are often relative; Request() rejects those with
        # ValueError, so resolve against the page URL first.
        absoluteUrl = urljoin(self.pageUrl, url)
        req = request.Request(absoluteUrl, headers=self.headers)
        with self.opener.open(req) as response:
            return response.read()

    def writeImage(self, img, imgName):
        """
        Create an ``image`` sub-folder under the current working directory
        (if it does not exist yet) and write the image into it.

        :param img: image data (bytes)
        :param imgName: file name of the image
        :return: None
        """
        folderName = os.path.join(os.path.abspath(os.curdir), "image")
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(folderName, exist_ok=True)
        # Write into the folder that was just ensured, not a second,
        # separately-built relative path.
        with open(os.path.join(folderName, imgName), 'wb') as f:
            f.write(img)


if __name__ == "__main__":
    url = "http://www.dfenqi.cn/Product/Index"
    spilder = Spilder(url)
    html = spilder.loadPage()
    xpath = "//div[@class='liebiao']/ul/li/div/a/img/@src"
    imgUrls = spilder.getImageUrls(html, xpath)
    # Number the files from 1; a separate loop name keeps ``url`` (the page
    # URL) from being overwritten by each image URL.
    for index, imgUrl in enumerate(imgUrls, start=1):
        img = spilder.loadImage(imgUrl)
        spilder.writeImage(img, 'img%s.jpg' % index)

# Reposted from: https://www.cnblogs.com/toloy/p/8618007.html
總結
以上是生活随笔為你收集整理的《6、通过xpath获取网页数据》的全部內容，希望文章能夠幫你解決所遇到的問題。
- 上一篇: 格式工厂软件处理视频
- 下一篇: Swoole练习 Web