
Scraping Taoche (淘车网) with a distributed Scrapy spider

This post walks through setting up a scrapy-redis distributed spider that crawls used-car listings from taoche.com; it is shared here as a reference.

I. Master host configuration
1. Start the Redis server
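Before seeding anything, it is worth confirming that the Redis server actually accepts connections. A minimal check, assuming the redis-py package and the default port 6379:

from redis import Redis

r = Redis("localhost", 6379)
print(r.ping())  # True means the server is up and reachable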
2. The city.py file

# City codes
CITY_CODE = ['shijiazhuang', 'tangshan', 'qinhuangdao', 'handan', 'xingtai', 'baoding', 'zhangjiakou','chengde', 'cangzhou', 'langfang', 'hengshui', 'taiyuan', 'datong', 'yangquan', 'changzhi', 'jincheng','shuozhou', 'jinzhong', 'yuncheng', 'xinzhou', 'linfen', 'lvliang', 'huhehaote', 'baotou', 'wuhai','chifeng', 'tongliao', 'eerduosi', 'hulunbeier', 'bayannaoer', 'wulanchabu', 'xinganmeng','xilinguolemeng', 'alashanmeng', 'changchun', 'jilin', 'hangzhou', 'ningbo', 'wenzhou', 'jiaxing','huzhou', 'shaoxing', 'jinhua', 'quzhou', 'zhoushan', 'tz', 'lishui', 'bozhou', 'chizhou', 'xuancheng','nanchang', 'jingdezhen', 'pingxiang', 'jiujiang', 'xinyu', 'yingtan', 'ganzhou', 'jian', 'yichun', 'jxfz','shangrao', 'xian', 'tongchuan', 'baoji', 'xianyang', 'weinan', 'yanan', 'hanzhong', 'yl', 'ankang','shangluo', 'lanzhou', 'jiayuguan', 'jinchang', 'baiyin', 'tianshui', 'wuwei', 'zhangye', 'pingliang','jiuquan', 'qingyang', 'dingxi', 'longnan', 'linxia', 'gannan', 'xining', 'haidongdiqu', 'haibei','huangnan', 'hainanzangzuzizhizho', 'guoluo', 'yushu', 'haixi', 'yinchuan', 'shizuishan', 'wuzhong','guyuan', 'zhongwei', 'wulumuqi', 'kelamayi', 'shihezi', 'tulufandiqu', 'hamidiqu', 'changji', 'boertala','bazhou', 'akesudiqu', 'xinjiangkezhou', 'kashidiqu', 'hetiandiqu', 'yili', 'tachengdiqu', 'aletaidiqu','xinjiangzhixiaxian', 'changsha', 'zhuzhou', 'xiangtan', 'hengyang', 'shaoyang', 'yueyang', 'changde','zhangjiajie', 'yiyang', 'chenzhou', 'yongzhou', 'huaihua', 'loudi', 'xiangxi', 'guangzhou', 'shaoguan','shenzhen', 'zhuhai', 'shantou', 'foshan', 'jiangmen', 'zhanjiang', 'maoming', 'zhaoqing', 'huizhou','meizhou', 'shanwei', 'heyuan', 'yangjiang', 'qingyuan', 'dongguan', 'zhongshan', 'chaozhou', 'jieyang','yunfu', 'nanning', 'liuzhou', 'guilin', 'wuzhou', 'beihai', 'fangchenggang', 'qinzhou', 'guigang','yulin', 'baise', 'hezhou', 'hechi', 'laibin', 'chongzuo', 'haikou', 'sanya', 'sanshashi', 'qiongbeidiqu','qiongnandiqu', 'hainanzhixiaxian', 'chengdu', 'zigong', 'panzhihua', 'luzhou', 'deyang', 'mianyang','guangyuan', 'suining', 'neijiang', 'leshan', 'nanchong', 'meishan', 'yibin', 'guangan', 'dazhou', 'yaan','bazhong', 'ziyang', 'aba', 'ganzi', 'liangshan', 'guiyang', 'liupanshui', 'zunyi', 'anshun','tongrendiqu', 'qianxinan', 'bijiediqu', 'qiandongnan', 'qiannan', 'kunming', 'qujing', 'yuxi', 'baoshan','zhaotong', 'lijiang', 'puer', 'lincang', 'chuxiong', 'honghe', 'wenshan', 'xishuangbanna', 'dali','dehong', 'nujiang', 'diqing', 'siping', 'liaoyuan', 'tonghua', 'baishan', 'songyuan', 'baicheng','yanbian', 'haerbin', 'qiqihaer', 'jixi', 'hegang', 'shuangyashan', 'daqing', 'yc', 'jiamusi', 'qitaihe','mudanjiang', 'heihe', 'suihua', 'daxinganlingdiqu', 'shanghai', 'tianjin', 'chongqing', 'nanjing', 'wuxi','xuzhou', 'changzhou', 'suzhou', 'nantong', 'lianyungang', 'huaian', 'yancheng', 'yangzhou', 'zhenjiang','taizhou', 'suqian', 'lasa', 'changdudiqu', 'shannan', 'rikazediqu', 'naqudiqu', 'alidiqu', 'linzhidiqu','hefei', 'wuhu', 'bengbu', 'huainan', 'maanshan', 'huaibei', 'tongling', 'anqing', 'huangshan', 'chuzhou','fuyang', 'sz', 'chaohu', 'luan', 'fuzhou', 'xiamen', 'putian', 'sanming', 'quanzhou', 'zhangzhou','nanping', 'longyan', 'ningde', 'jinan', 'qingdao', 'zibo', 'zaozhuang', 'dongying', 'yantai', 'weifang','jining', 'taian', 'weihai', 'rizhao', 'laiwu', 'linyi', 'dezhou', 'liaocheng', 'binzhou', 'heze','zhengzhou', 'kaifeng', 'luoyang', 'pingdingshan', 'jiyuan', 'anyang', 'hebi', 'xinxiang', 'jiaozuo','puyang', 'xuchang', 'luohe', 'sanmenxia', 'nanyang', 'shangqiu', 'xinyang',
    'zhoukou', 'zhumadian','henanzhixiaxian', 'wuhan', 'huangshi', 'shiyan', 'yichang', 'xiangfan', 'ezhou', 'jingmen', 'xiaogan','jingzhou', 'huanggang', 'xianning', 'qianjiang', 'suizhou', 'xiantao', 'tianmen', 'enshi','hubeizhixiaxian', 'beijing', 'shenyang', 'dalian', 'anshan', 'fushun', 'benxi', 'dandong', 'jinzhou','yingkou', 'fuxin', 'liaoyang', 'panjin', 'tieling', 'chaoyang', 'huludao', 'anhui', 'fujian', 'gansu','guangdong', 'guangxi', 'guizhou', 'hainan', 'hebei', 'henan', 'heilongjiang', 'hubei', 'hunan', 'jl','jiangsu', 'jiangxi', 'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shandong', 'shanxi', 'shaanxi','sichuan', 'xizang', 'xinjiang', 'yunnan', 'zhejiang', 'jjj', 'jzh', 'zsj', 'csj', 'ygc']

# Car brand codes
CAR_CODE_LIST = ['southeastautomobile', 'sma', 'audi', 'hummer', 'tianqimeiya', 'seat', 'lamborghini', 'weltmeister','changanqingxingche-281', 'chevrolet', 'fiat', 'foday', 'eurise', 'dongfengfengdu', 'lotus-146', 'jac','enranger', 'bjqc', 'luxgen', 'jinbei', 'sgautomotive', 'jonwayautomobile', 'beijingjeep', 'linktour','landrover', 'denza', 'jeep', 'rely', 'gacne', 'porsche', 'wey', 'shenbao', 'bisuqiche-263','beiqihuansu', 'sinogold', 'roewe', 'maybach', 'greatwall', 'chenggongqiche', 'zotyeauto', 'kaersen','gonow', 'dodge', 'siwei', 'ora', 'lifanmotors', 'cajc', 'hafeiautomobile', 'sol', 'beiqixinnengyuan','dorcen', 'lexus', 'mercedesbenz', 'ford', 'huataiautomobile', 'jmc', 'peugeot', 'kinglongmotor','oushang', 'dongfengxiaokang-205', 'chautotechnology', 'faw-hongqi', 'mclaren', 'dearcc','fengxingauto', 'singulato', 'nissan', 'saleen', 'ruichixinnengyuan', 'yulu', 'isuzu', 'zhinuo','alpina', 'renult', 'kawei', 'cadillac', 'hanteng', 'defu', 'subaru', 'huasong', 'casyc', 'geely','xpeng', 'jlkc', 'sj', 'nanqixinyatu1', 'horki', 'venucia', 'xinkaiauto', 'traum','shanghaihuizhong-45', 'zhidou', 'ww', 'riich', 'brillianceauto', 'galue', 'bugatti','guagnzhouyunbao', 'borgward', 'qzbd1', 'bj', 'changheauto', 'faw', 'saab', 'fuqiautomobile', 'skoda','citroen', 'mitsubishi', 'opel', 'qorosauto', 'zxauto', 'infiniti', 'mazda', 'arcfox-289','jinchengautomobile', 'kia', 'mini', 'tesla', 'gmc-109', 'chery', 'daoda-282', 'joylongautomobile','hafu-196', 'sgmw', 'wiesmann', 'acura', 'yunqueqiche', 'volvo', 'lynkco', 'karry', 'chtc', 'gq','redstar', 'everus', 'kangdi', 'chrysler', 'cf', 'maxus', 'smart', 'maserati', 'dayu', 'besturn','dadiqiche', 'ym', 'huakai', 'buick', 'faradayfuture', 'leapmotor', 'koenigsegg', 'bentley','rolls-royce', 'iveco', 'dongfeng-27', 'haige1', 'ds', 'landwind', 'volkswagen', 'sitech', 'toyota','polarsunautomobile', 'zhejiangkaersen', 'ladaa', 'lincoln', 'weilaiqiche', 'li', 'ferrari', 'jetour','honda', 'barbus', 'morgancars', 'ol', 'sceo', 'hama', 'dongfengfengguang', 'mg-79', 'ktm','changankuayue-283', 'suzuki', 'yudo', 'yusheng-258', 'fs', 'bydauto', 'jauger', 'foton', 'pagani','shangqisaibao', 'guangqihinomotors', 'polestar', 'fujianxinlongmaqichegufenyouxiangongsi','alfaromeo', 'shanqitongjia1', 'xingchi', 'lotus', 'hyundai', 'kaiyi', 'isuzu-132', 'bmw', 'ssangyong','astonmartin']

3. The redis_url.py script

from taoche.taoche.spiders.city import CITY_CODE, CAR_CODE_LIST
from redis import Redis


class Redis_url():
    def __init__(self):
        # Connect to the local Redis server
        self.re = Redis("localhost", 6379)

    def add(self, url):
        # Push the url onto the "taoche:start_urls" list with lpush
        self.re.lpush("taoche:start_urls", url)

    def flushdb(self):
        # Intended to clear any old requests from Redis before seeding; left unimplemented here
        pass


rd = Redis_url()  # create the helper object
# Seed one start URL for every city / brand combination
for city in CITY_CODE:
    for car_code in CAR_CODE_LIST:
        rd.add("https://{}.taoche.com/{}/".format(city, car_code))
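After running the script, you can sanity-check that the seed URLs actually landed in the "taoche:start_urls" list. A small sketch, again assuming redis-py on the master host:

from redis import Redis

re = Redis("localhost", 6379)
print(re.llen("taoche:start_urls"))          # total number of seeded start URLs
print(re.lrange("taoche:start_urls", 0, 2))  # peek at the first few entries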

II. Worker machine configuration (the machines that do the crawling)
1. settings.py configuration
Add the following code:

# Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplicate requests with the scrapy-redis fingerprint filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis server address (the master host to connect to)
REDIS_HOST = '10.10.21.13'
# Redis port
REDIS_PORT = 6379
# Queue type used by the scheduler
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

The complete settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for taoche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'taoche'

SPIDER_MODULES = ['taoche.spiders']
NEWSPIDER_MODULE = 'taoche.spiders'

# Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplicate requests with the scrapy-redis fingerprint filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis server address (the master host to connect to)
REDIS_HOST = '10.10.21.13'
# Redis port
REDIS_PORT = 6379
# Queue type used by the scheduler
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'taoche (+http://www.yourdomain.com)'  # user agent sent with every request

# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # ignore robots.txt

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'taoche.middlewares.TaocheSpiderMiddleware': 543,
}  # spider middleware

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'taoche.middlewares.TaocheDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'taoche.pipelines.TaochePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
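Two scrapy-redis options that this project does not use, but that often come up in distributed setups, are queue persistence and the built-in Redis item pipeline. The lines below are an optional sketch, not part of the original configuration:

# Keep the request queue and dupefilter in Redis between runs,
# so an interrupted crawl can resume instead of starting over
SCHEDULER_PERSIST = True

# Optionally also push scraped items into Redis alongside MongoDB
ITEM_PIPELINES = {
    'taoche.pipelines.TaochePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}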

2. taochec.py in the spiders folder

# -*- coding: utf-8 -*-
import scrapy
from .city import *  # import everything from city.py in the same package
from lxml import etree
from ..items import TaocheItem
from scrapy_redis.spiders import RedisSpider
import re


class TaochecSpider(RedisSpider):  # distributed spider based on scrapy-redis
    name = 'taochec'
    redis_key = "taoche:start_urls"

    # A plain, non-distributed spider would look like this instead:
    # class TaochecSpider(scrapy.Spider):
    #     name = 'taochec'
    #     allowed_domains = ['taoche.com']
    #     start_urls = []
    #     for city in CITY_CODE[:3]:
    #         for pinpai in CAR_CODE_LIST[:3]:
    #             url = f'https://{city}.taoche.com/{pinpai}/'
    #             start_urls.append(url)
    #             print(url)

    def parse(self, response):
        tree = etree.HTML(response.body.decode('utf-8'))
        # Grab the <li> listing entries on the page
        li_list = tree.xpath('//ul[@class="gongge_ul"]//li')
        print(len(li_list))
        if len(li_list) == 0:
            pass
        else:
            for li_data in li_list:
                item = TaocheItem()
                # Title
                title = li_data.xpath('./div[@class="gongge_main"]//span/text()')[0]
                reg_date = li_data.xpath('./div[@class="gongge_main"]/p/i[1]/text()')[0]
                mile = li_data.xpath('./div[@class="gongge_main"]/p/i[2]/text()')[0]
                city_name = tree.xpath('//div[@class="nav_statusMain"]//a[2]/text()')[0]
                price = li_data.xpath('.//div[@class="price"]//i[@class="Total brand_col"]/text()')[0]
                try:
                    all_price = li_data.xpath('.//div[@class="price"]//i[@class="onepaynor"]/text()')[0]
                except:
                    all_price = li_data.xpath('.//div[@class="price"]//i[@class="original"]/text()')[0]
                # Relative url of the detail page
                base_url = li_data.xpath('.//div[@class="item_img"]/a/@href')[0]
                # Build the absolute url
                detail_url = 'https:' + base_url
                item['title'] = title
                item['reg_date'] = reg_date
                item['mile'] = mile
                item['city_name'] = city_name
                item['price'] = price
                item['all_price'] = all_price
                item['detail_url'] = detail_url
                yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                     meta={'item': item}, dont_filter=True)

        # Follow the "next page" link if there is one
        page_next = tree.xpath('//a[@class="pages-next"]')
        if page_next:
            next_url = tree.xpath('//a[@class="pages-next"]/@href')[0]
            next_url = 'http:' + next_url
            yield scrapy.Request(next_url, callback=self.parse, encoding='utf-8', dont_filter=True)

    def parse_detail(self, response):
        item = response.meta["item"]
        print(response.url)
        response = response.body.decode('utf-8')
        tree = etree.HTML(response)
        # Picture
        pic = tree.xpath('//div[@class="taoche-details-xs-picbox"]//ul[@id="taoche-details-xs-pic"]//li[1]/img/@data-src')[0]
        # Engine displacement
        displace = tree.xpath('//div[@class="summary-attrs"]//dl[3]/dd/text()')[0]
        # Car source (listing) id
        source_id = tree.xpath('//span[@class="car-number"]/text()')[0]
        source_id = source_id.split(':')[-1]
        item["pic"] = pic
        item["displace"] = displace
        item["source_id"] = source_id
        item["name"] = '天主極樂大帝'
        yield item
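Each worker machine then starts the spider in the usual way; because it is a RedisSpider, it sits idle until URLs appear under "taoche:start_urls" and keeps waiting for more work rather than closing on its own. A small launcher script, shown as a sketch and assuming it is run from the project root (equivalent to running scrapy crawl taochec):

# run_worker.py -- hypothetical helper script
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'taochec'])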

3. items.py

import scrapy


class TaocheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()        # title
    reg_date = scrapy.Field()     # registration date
    mile = scrapy.Field()         # mileage
    city_name = scrapy.Field()    # city name
    price = scrapy.Field()        # discounted price
    all_price = scrapy.Field()    # full price
    # Detail-page fields
    detail_url = scrapy.Field()   # detail page url
    pic = scrapy.Field()          # picture
    displace = scrapy.Field()     # engine displacement
    source_id = scrapy.Field()    # car source (listing) id
    name = scrapy.Field()         # extra tag written by the spider

4. pipelines.py

import pymongo


class TaochePipeline(object):
    def __init__(self):
        # Connect to the MongoDB host and create a client object
        self.client = pymongo.MongoClient('10.10.21.13', port=27017)
        # Select the database and the collection
        self.db = self.client['taoche']
        self.collection = self.db['taoche']

    def process_item(self, item, spider):
        # insert_one replaces the deprecated insert() from older pymongo versions
        self.collection.insert_one(dict(item))
        return item
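Because several workers write into the same collection, the same listing can be inserted more than once. One way to guard against that, shown here only as a sketch and not part of the original pipeline, is a unique index on source_id combined with an upsert:

import pymongo

client = pymongo.MongoClient('10.10.21.13', port=27017)
collection = client['taoche']['taoche']
# create_index is idempotent, so it is safe to call at every start-up
collection.create_index('source_id', unique=True)

def save(item):
    # replace_one with upsert=True overwrites an existing listing
    # instead of raising a duplicate-key error
    collection.replace_one({'source_id': item['source_id']}, dict(item), upsert=True)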

III. MongoDB host configuration
1. Edit the mongo.config file


Add the following lines to that file:

logappend=true
journal=true
quiet=true
port=27017
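Whether the worker machines can actually write to this MongoDB instance also depends on how mongod binds to the network; the snippet above only sets logging, journaling and the port. A quick connectivity check from a worker, assuming pymongo and the address used in the pipeline:

import pymongo

client = pymongo.MongoClient('10.10.21.13', port=27017, serverSelectionTimeoutMS=3000)
print(client.admin.command('ping'))  # {'ok': 1.0} means the worker can reach MongoDB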

Summary

That covers the full setup for the distributed Taoche spider: seeding start URLs into Redis on the master, pointing the scrapy-redis workers at it, and storing the results in MongoDB. Hopefully it helps with any problems you run into when building something similar.
