當前位置：首頁 > 编程语言 > python >内容正文

python

python-scrapy-MongoDB 爬取链家二手房

發布時間：2023/12/8 python 22 豆豆

生活随笔收集整理的這篇文章主要介紹了 python-scrapy-MongoDB 爬取链家二手房，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

python-scrapy-MongoDB 爬取鏈家二手房

鏈家二手房房源數據抓取
目標網址為http://bj.lianjia.com/ershoufang/

分析網址

創建項目

scrapy startproject lianjia

創建爬蟲文件

scrapy genspider -t crawl lianjiahouse lianjia.com

編寫items.py文件，設置需要抓取的內容

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class LianjiaItem(scrapy.Item):
    """Fields collected for one second-hand house listing."""

    # ---- Listing overview ------------------------------------------------
    # Title of the published listing
    house_name = scrapy.Field()
    # Name of the residential community
    community_name = scrapy.Field()
    # District / location (field currently disabled)
    # location = scrapy.Field()
    # Lianjia listing number
    house_record = scrapy.Field()
    # Total asking price
    total_amount = scrapy.Field()
    # Unit price
    unit_price = scrapy.Field()

    # ---- Basic house information -----------------------------------------
    # Gross floor area
    area_total = scrapy.Field()
    # Usable (interior) area
    area_use = scrapy.Field()
    # Room layout (bedrooms / living rooms)
    house_type = scrapy.Field()
    # Orientation
    direction = scrapy.Field()
    # Decoration / renovation state
    sub_info = scrapy.Field()
    # Heating method
    heating_method = scrapy.Field()
    # Property right (field currently disabled)
    # house_property = scrapy.Field()
    # Floor the unit is on
    floor = scrapy.Field()
    # Total number of floors in the building
    total_floors = scrapy.Field()
    # Whether the building has an elevator
    is_left = scrapy.Field()
    # Elevator-to-household ratio
    left_rate = scrapy.Field()
    # Layout structure
    structure = scrapy.Field()

    # ---- Transaction information -----------------------------------------
    # Date the listing was published
    release_date = scrapy.Field()
    # Date of the previous transaction
    last_trade_date = scrapy.Field()
    # Years the house has been held
    house_years = scrapy.Field()
    # Mortgage information
    pawn = scrapy.Field()
    # Transaction ownership type
    trade_property = scrapy.Field()
    # Usage of the house
    house_usage = scrapy.Field()
    # Property ownership
    property_own = scrapy.Field()

    # ---- Images ----------------------------------------------------------
    # URLs of the listing pictures
    images_urls = scrapy.Field()
    # Results of the downloaded pictures
    images = scrapy.Field()

編寫pipelines.py，主要編寫兩個Pipeline類：一個用于保存數據，另一個用于處理圖片

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
import pymongo


class LianjiaPipeline(object):
    """Persist every scraped item into a MongoDB collection."""

    # Name of the collection the documents are stored in
    collection_name = 'secondhandhouse'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; the connection itself is
        opened later in ``open_spider``.

        :param mongo_uri: address passed to ``pymongo.MongoClient``
        :param mongo_db: name of the database to write to
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MongoDB options in the crawler settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'lianjia'),
        )

    def open_spider(self, spider):
        """Connect to the server, then select the configured database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item as one document and pass it on unchanged.

        ``insert_one`` replaces the legacy ``Collection.insert`` call, which
        is deprecated in PyMongo 3 and removed in PyMongo 4.
        """
        # dict(item) makes a copy, so the "_id" PyMongo adds to the inserted
        # document does not leak into the item seen by later pipelines.
        self.db[self.collection_name].insert_one(dict(item))
        return item


class LianjiaImagePipeline(ImagesPipeline):
    """Download listing pictures into one folder per house title."""

    def get_media_requests(self, item, info):
        """Yield one download request per picture URL of the item.

        The item travels along in ``meta`` so that ``file_path`` can derive
        the folder name from it. An item without picture URLs yields nothing.
        """
        for image_url in item.get('images_urls') or []:
            yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``<house title>/<last URL segment>``."""
        # The item is taken from the request meta set in get_media_requests.
        item = request.meta['item']
        image_folder = item['house_name']
        # The last segment of the picture URL serves as the file name.
        image_guild = request.url.split('/')[-1]
        return u'{0}/{1}'.format(image_folder, image_guild)

在settings.py中激活Pipeline，設置圖片存儲信息、MongoDB數據信息。

# Scrapy settings for lianjia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lianjia'

NEWSPIDER_MODULE = 'lianjia.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lianjia (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed by this project
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
}

# Proxy addresses read by the downloader middleware. You have to source proxy
# IPs yourself; the author obtained these from Zhima Proxy
# (http://www.zhimaruanjian.com/?utm-source=bdtg&utm-keyword=?246), which hands
# out 20 free IPs per day (not an advertisement).
PROXY_LIST = [
    'http://182.240.0.146:4245',
    'http://114.96.196.244:4264',
    'http://58.218.201.114:7007',
    'http://59.58.43.88:4235',
    'http://218.95.115.97:4254',
    'http://220.164.105.19:4228',
    'http://125.111.151.110:4205',
    'http://106.125.163.111:4257',
]

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
    'lianjia.pipelines.LianjiaImagePipeline': 400,
}

# Image pipeline: storage directory, and the item fields holding the picture
# URLs and the download results
IMAGES_STORE = 'D:\\pycharm\\pych\\scrapy\\lianjia\\images'
IMAGES_URLS_FIELD = 'images_urls'
IMAGES_RESULT_FIELD = 'images'

# MongoDB connection settings
MONGO_URI = 'localhost:27017'
MONGO_DATABASE = 'lianjia'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

在middlewares.py中編寫Spider中間件和Downloader中間件

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import scrapy
import random

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class LianjiaSpiderMiddleware(object):
    """Count, via the Scrapy stats collector, how many scraped listings
    belong to each community.

    Not all methods need to be defined. If a method is not defined, scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    def __init__(self, stats):
        """:param stats: stats collector used to keep the per-community counters"""
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware with the crawler's stats collector."""
        middleware = cls(stats=crawler.stats)
        # Without this connection spider_opened() would never be invoked.
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through to the spider unchanged."""
        return None

    def process_spider_output(self, response, result, spider):
        """Pass every spider result on, counting items per community name.

        Only ``scrapy.Item`` results are counted; requests and other results
        are forwarded untouched. Items with a missing or empty community name
        are forwarded but not counted, so no ``None`` stat key is created.
        """
        for item in result:
            if isinstance(item, scrapy.Item):
                community_name = item.get('community_name')
                if community_name:
                    # One counter per community, keyed by its name.
                    self.stats.inc_value(community_name)
            yield item

    def process_spider_exception(self, response, exception, spider):
        """Do not handle spider exceptions here (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the start requests unchanged."""
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)


class LianjiaDownloaderMiddleware(object):
    """Attach a randomly chosen proxy to every outgoing request."""

    def __init__(self, proxy_list):
        """:param proxy_list: proxy URLs to choose from; None means no proxies"""
        self.proxy_list = proxy_list or []

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware with the PROXY_LIST taken from the settings."""
        middleware = cls(proxy_list=crawler.settings.get('PROXY_LIST'))
        # Without this connection spider_opened() would never be invoked.
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Pick a random proxy and store it in ``request.meta['proxy']``.

        With an empty proxy list the request is left untouched instead of
        failing inside ``random.choice``.
        """
        if not self.proxy_list:
            return None
        request.meta['proxy'] = random.choice(self.proxy_list)
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Do not handle download exceptions here (returns None)."""
        return None

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

編寫爬蟲文件，其中的Rule要追蹤每條房屋信息的詳細頁面

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from lianjia.items import LianjiaItem


class SechandHouseSpider(CrawlSpider):
    """Crawl the Beijing Lianjia second-hand listing pages and scrape every
    house detail page into a ``LianjiaItem``.
    """

    name = 'lianjiahouse'
    allowed_domains = ['lianjia.com']
    start_urls = ['http://bj.lianjia.com/ershoufang/']

    rules = (
        # Follow links to detail pages: /ershoufang/<12 digits>.html
        Rule(LinkExtractor(allow=r'/ershoufang/\d{12}\.html'),
             callback='parse_item'),
    )

    # XPath templates for the n-th entry of the "base" and "transaction"
    # lists on a detail page.
    # NOTE(review): the entry positions are tied to the page layout at the
    # time of writing -- verify against the live page.
    _BASE_XPATH = '//div[@class="base"]//ul/li[{0}]/text()'
    _TRANSACTION_XPATH = '//div[@class="transaction"]//ul/li[{0}]/span[2]/text()'

    # A number with an optional fractional part, e.g. "89" or "89.56"
    _AREA_PATTERN = r'\d+(?:\.\d+)?'

    def _base(self, response, position):
        """Select the text of entry ``position`` in the basic-info list."""
        return response.xpath(self._BASE_XPATH.format(position))

    def _transaction(self, response, position):
        """Select the value text of entry ``position`` in the transaction list."""
        return response.xpath(self._TRANSACTION_XPATH.format(position))

    def parse_item(self, response):
        """Extract one house listing from a detail page.

        Fields whose selector matches nothing are stored as ``None``, except
        ``house_name`` and ``pawn``, which fall back to an empty string so the
        whitespace clean-up cannot fail.

        :param response: response of a house detail page
        :return: generator yielding a single ``LianjiaItem``
        """
        item = LianjiaItem()

        # ---- Listing overview ----
        # Listing title, with spaces removed
        item['house_name'] = (
            response.css('title::text').extract_first(default='').replace(' ', '')
        )
        # Community the house belongs to
        item['community_name'] = response.css('.communityName a::text').extract_first()
        # Lianjia listing number
        item['house_record'] = response.css('.houseRecord .info::text').extract_first()
        # Total price
        item['total_amount'] = response.css('.overview .total::text').extract_first()
        # Unit price
        item['unit_price'] = response.css('.unitPriceValue::text').extract_first()

        # ---- Basic house information ----
        # Gross floor area
        item['area_total'] = self._base(response, 3).re_first(self._AREA_PATTERN)
        # Usable area
        item['area_use'] = self._base(response, 5).re_first(self._AREA_PATTERN)
        # Room layout
        item['house_type'] = self._base(response, 1).extract_first()
        # Orientation
        item['direction'] = self._base(response, 7).extract_first()
        # Decoration state
        item['sub_info'] = self._base(response, 9).extract_first()
        # Heating method
        item['heating_method'] = self._base(response, 11).extract_first()
        # Floor
        item['floor'] = self._base(response, 2).extract_first()
        # Total floors: first number found in the floor entry
        item['total_floors'] = self._base(response, 2).re_first(r'\d+')
        # Elevator
        item['is_left'] = self._base(response, 12).extract_first()
        # Elevator-to-household ratio
        item['left_rate'] = self._base(response, 10).extract_first()
        # Layout structure
        item['structure'] = self._base(response, 8).extract_first()

        # ---- Transaction information ----
        # Listing date
        item['release_date'] = self._transaction(response, 1).extract_first()
        # Previous transaction date
        item['last_trade_date'] = self._transaction(response, 3).extract_first()
        # Years the house has been held
        item['house_years'] = self._transaction(response, 5).extract_first()
        # Mortgage information, with all spaces and surrounding whitespace removed
        item['pawn'] = (
            self._transaction(response, 7)
            .extract_first(default='')
            .replace(' ', '')
            .strip()
        )
        # Transaction ownership type
        item['trade_property'] = self._transaction(response, 2).extract_first()
        # Usage of the house (previously overwrote trade_property)
        item['house_usage'] = self._transaction(response, 4).extract_first()
        # Property ownership (previously overwrote trade_property)
        item['property_own'] = self._transaction(response, 6).extract_first()

        # ---- Images ----
        item['images_urls'] = response.css('.smallpic> li::attr(data-pic)').extract()

        yield item

結果

總結

以上是生活随笔為你收集整理的python-scrapy-MongoDB 爬取链家二手房的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：夜深模拟器不能连接 adb shell
下一篇： python 动态图表大屏_爱了！Py