爬虫爬取车主指南各类汽车数据
生活随笔
收集整理的這篇文章主要介紹了
爬虫爬取车主指南各类汽车数据
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
前言
最近比較關注各類汽車的情況,在車主指南(https://www.icauto.com.cn)看見七月份的汽車前550銷量排行,于是便寫了爬蟲爬取該網頁的表格(https://www.icauto.com.cn/rank),在爬取網頁之后發現各種鏈接網址,并一并分網頁爬取了各種汽車的情況。
原理步驟
1、抓取主網頁,將主網頁的表格內容提取出來。
2、提取各種汽車的數據網址。
3、分頁抓取各類汽車型號等網頁,并將所需要的數據提取出來。
4、將各類數據寫入excel,并保存。
代碼實現
導入需要的庫
# --- imports needed by the scraper ------------------------------------
import urllib.parse

import requests
import xlwt
from lxml import etree

# --- fetch the main page and pull out the ranking table ---------------
# Browser User-Agent so the site does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
url = 'https://www.icauto.com.cn/rank/'  # main ranking page URL


def _first(nodes):
    """Return the first XPath text match, stripped, or '' when nothing matched.

    The original code appended the raw XPath result *list* to each column;
    xlwt cannot write a list into a cell (raises "Unexpected data type"),
    so every cell value must be reduced to a single string.
    """
    return nodes[0].strip() if nodes else ''


def page_analyze():
    """Scrape the sales-ranking table, then each car's detail page, and
    save every collected column to an Excel workbook.

    Side effects: ~1 HTTP request per ranked car, console progress output,
    and one .xls file written to the current directory.
    """
    # Fetch and parse the ranking page directly; the original round-tripped
    # the HTML through a temp file for no benefit.
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)

    # One XPath per table column.
    paihang = selector.xpath('//table[@class="bordered"]/tr/td[1]/text()')      # rank
    chexing = selector.xpath('//table[@class="bordered"]/tr/td[2]/a/text()')    # model
    pingpai = selector.xpath('//table[@class="bordered"]/tr/td[3]/a[1]/text()') # brand
    jiage = selector.xpath('//table[@class="bordered"]/tr/td[3]/text()[2]')     # guide price
    xiaoliang1 = selector.xpath('//table[@class="bordered"]/tr/td[4]/text()')   # monthly sales
    xiaoliang2 = selector.xpath('//table[@class="bordered"]/tr/td[5]/text()')   # yearly sales
    hrefs = selector.xpath('//table[@class="bordered"]/tr/td[2]/a/@href')       # detail links

    # --- visit each car's detail page and extract its attributes ------
    leixing, baozhi, guobei, chechang, cheshen = [], [], [], [], []
    # Iterate over the links actually found instead of a hard-coded
    # range(550), which raised IndexError on a shorter table.
    for href in hrefs:
        new_url = urllib.parse.urljoin(url, href)  # resolve relative link
        print(new_url)
        resp = requests.get(url=new_url, headers=headers)
        htm = etree.HTML(resp.text)
        # dd[2..6] hold: country, type, manufacturer, value retention, body.
        guobei.append(_first(htm.xpath('//div[@class="carInfo"]/dd[2]/text()')))
        leixing.append(_first(htm.xpath('//div[@class="carInfo"]/dd[3]/text()')))
        chechang.append(_first(htm.xpath('//div[@class="carInfo"]/dd[4]/text()')))
        baozhi.append(_first(htm.xpath('//div[@class="carInfo"]/dd[5]/text()')))
        cheshen.append(_first(htm.xpath('//div[@class="carInfo"]/dd[6]/text()')))

    # Column order must match the header titles written below.
    data = [paihang, chexing, pingpai, jiage, xiaoliang1, xiaoliang2,
            guobei, baozhi, cheshen, chechang, leixing]

    # --- write everything to Excel and save ---------------------------
    # NOTE: xlwt can only produce the legacy .xls format. The original
    # passed a path as Workbook's *encoding* argument and saved under an
    # .xlsx extension, producing a file Excel refuses to open.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('2022.06汽車銷量排行榜')
    titles = ['排名', '車型', '品牌', '指導價格', '月銷量', '年銷量',
              '國別', '保值率', '車身結構', '車廠', '類型']
    for col, title in enumerate(titles):
        worksheet.write(0, col, title)
    for col, column in enumerate(data):
        for row, value in enumerate(column):
            worksheet.write(row + 1, col, value)
    workbook.save('./2022.06汽車銷量排行.xls')
# Complete code: scrape the icauto.com.cn sales ranking plus every car's
# detail page, and save all columns to an Excel workbook.
import urllib.parse
from time import sleep

import requests
import xlwt
from lxml import etree

# Request parameters for the car ranking pages: a browser User-Agent so
# the site does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
url = 'https://www.icauto.com.cn/rank/'  # main ranking page URL


def _first(nodes):
    """Return the first XPath text match, stripped, or '' when nothing matched.

    The original code appended the raw XPath result *list* to each column;
    xlwt cannot write a list into a cell (raises "Unexpected data type"),
    so every cell value must be reduced to a single string.
    """
    return nodes[0].strip() if nodes else ''


def page_analyze():
    """Scrape the sales-ranking table, then each car's detail page, and
    save every collected column to an Excel workbook.

    Side effects: ~1 HTTP request per ranked car (throttled), console
    progress output, and one .xls file written to the current directory.
    """
    # Fetch and parse the ranking page directly; the original round-tripped
    # the HTML through a temp file for no benefit.
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)

    # One XPath per table column.
    paihang = selector.xpath('//table[@class="bordered"]/tr/td[1]/text()')      # rank
    chexing = selector.xpath('//table[@class="bordered"]/tr/td[2]/a/text()')    # model
    pingpai = selector.xpath('//table[@class="bordered"]/tr/td[3]/a[1]/text()') # brand
    jiage = selector.xpath('//table[@class="bordered"]/tr/td[3]/text()[2]')     # guide price
    xiaoliang1 = selector.xpath('//table[@class="bordered"]/tr/td[4]/text()')   # monthly sales
    xiaoliang2 = selector.xpath('//table[@class="bordered"]/tr/td[5]/text()')   # yearly sales
    hrefs = selector.xpath('//table[@class="bordered"]/tr/td[2]/a/@href')       # detail links

    # Visit each car's detail page and extract its attributes.
    leixing, baozhi, guobei, chechang, cheshen = [], [], [], [], []
    # Iterate over the links actually found instead of a hard-coded
    # range(550), which raised IndexError on a shorter table.
    for href in hrefs:
        new_url = urllib.parse.urljoin(url, href)  # resolve relative link
        print(new_url)
        resp = requests.get(url=new_url, headers=headers)
        htm = etree.HTML(resp.text)
        # dd[2..6] hold: country, type, manufacturer, value retention, body.
        guobei.append(_first(htm.xpath('//div[@class="carInfo"]/dd[2]/text()')))
        leixing.append(_first(htm.xpath('//div[@class="carInfo"]/dd[3]/text()')))
        chechang.append(_first(htm.xpath('//div[@class="carInfo"]/dd[4]/text()')))
        baozhi.append(_first(htm.xpath('//div[@class="carInfo"]/dd[5]/text()')))
        cheshen.append(_first(htm.xpath('//div[@class="carInfo"]/dd[6]/text()')))
        # Throttle between requests; the original imported sleep but
        # never used it, hammering the site with ~550 rapid requests.
        sleep(0.2)

    # Column order must match the header titles written below.
    data = [paihang, chexing, pingpai, jiage, xiaoliang1, xiaoliang2,
            guobei, baozhi, cheshen, chechang, leixing]

    # Write everything to Excel and save.
    # NOTE: xlwt can only produce the legacy .xls format. The original
    # passed a path as Workbook's *encoding* argument and saved under an
    # .xlsx extension, producing a file Excel refuses to open.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('2022.06汽車銷量排行榜')
    titles = ['排名', '車型', '品牌', '指導價格', '月銷量', '年銷量',
              '國別', '保值率', '車身結構', '車廠', '類型']
    for col, title in enumerate(titles):
        worksheet.write(0, col, title)
    for col, column in enumerate(data):
        for row, value in enumerate(column):
            worksheet.write(row + 1, col, value)
    workbook.save('./2022.06汽車銷量排行.xls')


if __name__ == '__main__':
    page_analyze()
    print('實踐小項目圓滿完成,再接再勵!')
因為博主比較懶,不喜歡碼字,本博客只給出具體實現代碼,至于實現細節就不再贅述;大家如果要學習,請參照代碼自行研究。
總結
以上是生活随笔為你收集整理的爬虫爬取车主指南各类汽车数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 三星emcp型号详解_三星emcp型号详
- 下一篇: PR视频剪辑教程--视频特效和转场