
Scraping images from a truck-listing site

Published: 2023/12/14
This article, collected and organized by 生活随笔, walks through a small Python crawler that scrapes listing details and images from a truck-listing site; it is shared here for reference.
# _*_ coding:UTF-8 _*_
# Author      : ZhangRong z00520111
# Created     : 2020/3/28 10:09
# File        : catchhuoche.py
# IDE         : PyCharm
# Description:
# Copyright @ Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.
# -*- coding: utf-8 -*-
import re
import requests
from pyquery import PyQuery as pq
from getcookie import excuteScript  # local helper, only used by the commented-out antipas logic below
import time, random
import json
import os

requests.packages.urllib3.disable_warnings()

# str(content).encode('ISO-8859-1').decode('utf-8')
carbrandlist = ['東風', '一汽', '江淮', '三環', '江鈴', '重汽', '福田', '陜汽', '上汽', '凱馬', '長安']
image_num = 0
car_num = 0  # index of the current car
prepath = 'E:/pictures/'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}


class HuoCheCrawler():
    def __init__(self):
        proxy_list = [
            # proxy settings -- fill in your own proxies; random.choice fails on an empty list
        ]
        # http=random.choice(proxy_list),
        # https=random.choice(proxy_list)
        proxies = {
            "http": random.choice(proxy_list),
            "https": random.choice(proxy_list)
        }
        # print(http)
        self.baseurl = 'https://www.hcj198.com'
        self.sess = requests.Session()
        self.sess.headers = headers
        self.sess.proxies = proxies
        self.start_url = 'https://www.hcj198.com/car.html'

    # def anti_value(self):
    #     '''
    #     Get the key and value needed for the antipas parameter
    #     :return:
    #     '''
    #     content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
    #     params = re.findall(r"value=anti\('(.*?)','(.*?)'\)", content)[0]
    #     return params

    # def caculate_antipas(self):
    #     '''
    #     Compute the antipas parameter
    #     :return:
    #     '''
    #     params = self.anti_value()
    #     antipas = excuteScript(params[0], params[1])
    #     self.sess.cookies.set('antipas', antipas)

    def page_url(self):
        # self.caculate_antipas()
        '''
        Get the pagination links
        :param start_url:
        :return:
        '''
        content = pq(self.sess.get(self.start_url, verify=False).text)
        # print(content)
        page_num_max = max([int(each.text()) for each in
                            content('div[@class="page-center search_list_one"] ul[@class="pagination"] > li > a').items()
                            if re.match(r'\d+', each.text())])
        page_url_list = []
        for i in range(1, page_num_max + 1, 1):
            base_url = 'https://www.hcj198.com/car.html?page={}'.format(i)
            # print("Page %d", i)
            # print(base_url)
            page_url_list.append(base_url)
        return page_url_list

    def index_page(self, start_url):
        '''
        Grab the detail-page links
        :param start_url:
        :return:
        '''
        # print(start_url)
        content = pq(self.sess.get(start_url).text)
        # print('$' * 200)
        # print(content)
        for each in content('ul[@class="car-ul"] > li > a').items():
            # print("each is ", each)
            url = each.attr.href
            # print("url is ", url)
            if not url.startswith('http'):
                url = self.baseurl + url
            yield url

    def detail_page(self, detail_url):
        '''
        Grab the detail information
        :param detail_url:
        :return:
        '''
        content = pq(self.sess.get(detail_url).text, parser="html")
        # print("content is ", content)
        # tem1 = str(tem('img'))
        # pattern = r'data-src=["](.*?)["]'
        # result = re.findall(pattern, tem1)
        detail = content('ul[@class="tages-param"] li div').text()
        eachDetail = detail.split(' ')
        tem = content('div[@class="tages-img-list"]')
        # print("tem is ", tem)
        tem1 = str(tem('div'))
        # print("tem1 is ", tem1)
        pattern = r'url\("(.*?)&quot'
        result = re.findall(pattern, tem1)
        # print("result is ", result)
        name = content('div[@class="pro-title-cmodel"]').text().strip()
        for brand in carbrandlist:
            carbrand = brand
            if name.find(brand) != -1:
                break
        # content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
        price = content('div[@class="detail-left-dprice"] div[@class="dprice-left"]').text()
        data_dict = {
            'name': name,
            'carbrand': carbrand,
            'bordingdate': eachDetail[0],
            'km': eachDetail[3],
            'displacement': eachDetail[4],
            'carstyle': eachDetail[1],
            'price': price[price.index('¥') + 1:],
            'image': result
        }
        if not data_dict['name']:
            print(str(content).encode('ISO-8859-1').decode('utf-8'))
        return data_dict, result

    def request_download(self, https, carbrand):
        global car_num
        proxy_list = [
            # proxy settings -- fill in your own proxies
        ]
        # http=random.choice(proxy_list),
        # https=random.choice(proxy_list)
        proxies = {
            "http": random.choice(proxy_list),
            "https": random.choice(proxy_list)
        }
        global image_num
        # print("http is ", https)
        r = requests.get(https, proxies=proxies, verify=False)
        with open(prepath + carbrand + '/' + carbrand + str(car_num - 1) + '/' + carbrand + str(car_num - 1)
                  + '_' + str(image_num) + '.png', 'wb') as f:
            f.write(r.content)
        image_num = image_num + 1

    def run(self):
        global car_num
        for pageurl in self.page_url():
            for detail_url in self.index_page(pageurl):
                # print("datail is ", detail_url)
                listout, result = self.detail_page(detail_url)
                data_string = json.dumps(listout, ensure_ascii=False)
                carbrand = listout['carbrand']
                filename = carbrand + str(car_num)
                isExists = os.path.exists(prepath + carbrand + '/' + filename + '/')
                # Check whether the output directory exists
                if not isExists:
                    # Create the directory if it does not exist
                    os.makedirs(prepath + carbrand + '/' + filename + '/')
                file = open(prepath + carbrand + '/' + filename + '/' + filename + ".txt", "a+", encoding='utf-8')
                file.write(data_string)
                file.close()
                car_num = car_num + 1
                print("list is ", listout)
                stop = 0
                for https in result:
                    if stop == 7:
                        break
                    self.request_download(self.baseurl + https, carbrand)
                    stop = stop + 1
                print("Pausing 5-15 seconds to avoid getting blocked")
                time.sleep(random.randint(5, 15))
                print('*' * 200)


if __name__ == '__main__':
    hccrawler = HuoCheCrawler()
    hccrawler.run()
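Note that both proxy_list definitions (in __init__ and in request_download) are empty in the listing above and must be filled in before running, since random.choice on an empty list raises an IndexError. The snippet below is a minimal sketch of the session-plus-proxy setup the crawler relies on; the proxy addresses are placeholders, not working proxies.

import random
import requests

proxy_list = [
    "http://127.0.0.1:8888",   # placeholder proxy entries -- replace with real ones
    "http://127.0.0.1:8889",
]
proxies = {
    "http": random.choice(proxy_list),
    "https": random.choice(proxy_list),
}

sess = requests.Session()
sess.headers.update({"User-Agent": "Mozilla/5.0"})
sess.proxies = proxies

# Every page request then goes through the same session (and proxy), e.g.:
# html = sess.get("https://www.hcj198.com/car.html", verify=False).text

With the proxies in place, running python catchhuoche.py creates one folder per listing under E:/pictures/<brand>/<brand><n>/, holding a <brand><n>.txt with the listing details and up to 7 downloaded images.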

Summary

The above is the full content of "Scraping images from a truck-listing site" as collected by 生活随笔; we hope it helps you solve the problems you ran into.
