Crawling NetEase (163.com) News
Based on the requests module
```python
# Data is dynamically loaded through a JS (JSONP) endpoint
import requests
import re
from lxml import etree
import json

url = 'https://temp.163.com/special/00804KVA/cm_war.js?callback=data_callback'
js_data = requests.get(url=url).text

# Strip the data_callback(...) JSONP wrapper and decode the JSON payload
ex = r'data_callback\((.*?)\)'
list_str = re.findall(ex, js_data, re.S)[0]
list_obj = json.loads(list_str)

for dic in list_obj:
    title = dic['title']
    detail_url = dic['docurl']
    # Fetch the article detail page and extract the body text
    page_text = requests.get(url=detail_url).text
    tree = etree.HTML(page_text)
    content = tree.xpath('//*[@id="endText"]//text()')
    content = ''.join(content).replace(' ', '').replace('\n', '')
    print(content)
```
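The regex above simply strips the `data_callback(...)` JSONP wrapper before calling `json.loads`. A minimal sketch of the same idea wrapped in a reusable helper with basic error handling (the function name `load_jsonp` and the default callback name are illustrative, not part of the original script):

```python
import json
import re

import requests


def load_jsonp(url, callback='data_callback'):
    """Fetch a JSONP endpoint and return the decoded payload, or None on failure."""
    js_data = requests.get(url=url).text
    match = re.search(r'%s\((.*?)\)' % re.escape(callback), js_data, re.S)
    if match is None:
        # the expected wrapper was not found, so there is nothing to decode
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None


articles = load_jsonp('https://temp.163.com/special/00804KVA/cm_war.js?callback=data_callback')
if articles:
    print(len(articles), 'articles in this board')
```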
Below is how Scrapy and Selenium are used together.

```python
# spider.py
# -*- coding: utf-8 -*-
import scrapy
from Net163.items import Net163Item
from selenium import webdriver
from selenium.webdriver import ChromeOptions


class NetPageSpider(scrapy.Spider):
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Create a browser object (it is reused by the downloader middleware)
    bro = webdriver.Chrome(executable_path=r'C:\spider\scrapy1\chromedriver.exe', options=option)

    name = 'net_page'
    model_urls = []  # holds the URLs of the four section (board) pages
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com']

    # Article detail page
    def content_parse(self, response):
        item = response.meta['item']
        # Parse the data and store it in the item
        content_lst = response.xpath('//div[@id="endText"]//text()').extract()
        # extract() returns a list of strings
        item['desc'] = ''.join(content_lst).replace(' ', '').replace('\n', '').replace('\t', '')  # join into one string
        yield item

    # Section (board) page
    def detail_parse(self, response):
        div_lst = response.xpath('//div[@class="ndi_main"]/div')
        for div in div_lst:
            item = Net163Item()
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item['title'] = title
            # meta is a dict; all of its key/value pairs are passed to the specified callback
            yield scrapy.Request(url=new_detail_url, callback=self.content_parse, meta={'item': item})

    # Entry callback for start_urls
    def parse(self, response):
        li_lst = response.xpath('//div[@class="ns_area list"]/ul/li')
        indexs = [3, 4, 6, 7]
        model_lst = []  # the four sections
        for index in indexs:
            li = li_lst[index]
            model_lst.append(li)
        # Parse each section's URL
        for li in model_lst:
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # Request each section URL to get the section page content
            yield scrapy.Request(url=model_url, callback=self.detail_parse)

    # Close the browser when the spider finishes
    def closed(self, spider):
        self.bro.quit()
```

```python
# items.py
import scrapy


class Net163Item(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
```

```python
# pipelines.py
# Pipelines handle persistence (txt, MySQL, Redis, MongoDB, ...);
# several classes can be defined and registered in settings.py.
class Net163Pipeline(object):
    def process_item(self, item, spider):
        print(item['title'], len(item['desc']))
        return item
```

```python
# Changes in settings.py
USER_AGENT = ''  # set a real User-Agent here to pass UA checks
ROBOTSTXT_OBEY = False  # ignore the robots.txt protocol
DOWNLOADER_MIDDLEWARES = {  # downloader middleware
    'Net163.middlewares.Net163DownloaderMiddleware': 543,
}
ITEM_PIPELINES = {  # pipeline classes
    'Net163.pipelines.Net163Pipeline': 300,
}
LOG_LEVEL = 'ERROR'  # log level
```
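The `Net163Pipeline` above only prints each item. As a minimal sketch (the class name, output file name, and format are illustrative and not from the original post), an extra pipeline could persist the items to a text file and be registered in `ITEM_PIPELINES` alongside the existing one, e.g. with `'Net163.pipelines.Net163FilePipeline': 301`:

```python
# pipelines.py (additional, hypothetical class)
class Net163FilePipeline(object):
    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.fp = open('net163_news.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one record per article: title followed by the cleaned body text
        self.fp.write('%s\n%s\n\n' % (item['title'], item['desc']))
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.fp.close()
```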
Finally, the downloader middleware in middlewares.py intercepts the responses of the four section pages and replaces them with Selenium-rendered pages:

```python
# middlewares.py
# -*- coding: utf-8 -*-
from time import sleep

from scrapy import signals
from scrapy.http import HtmlResponse


class Net163DownloaderMiddleware(object):
    # class method
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Handle requests
    def process_request(self, request, spider):
        return None

    # This method intercepts every response object; only certain responses need special handling
    def process_response(self, request, response, spider):
        # Locate the responses that need handling:
        # a response is matched through its request, and the request through its URL
        model_urls = spider.model_urls
        bro = spider.bro
        if request.url in model_urls:
            # The original responses for the four section pages lack the dynamically loaded data,
            # so manually build new response objects containing the rendered page source
            # and return them in place of the original responses.
            bro.get(request.url)  # load the section URL in the browser
            sleep(2)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            bro.execute_script(js)
            sleep(2)
            # the page source now contains the dynamically loaded data
            page_text = bro.page_source
            # Manually create a new response object; the body parameter carries page_text as the response data
            return HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        # Return the original response unchanged
        return response

    # Handle exceptions
    def process_exception(self, request, exception, spider):
        pass

    # Called when the spider is opened
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
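Because the middleware drives a real Chrome instance, every crawl opens a visible browser window. A minimal sketch of creating the driver in headless mode instead (assuming a Chrome/chromedriver version that supports the `--headless` flag; the executable path is the one used in the spider above):

```python
from selenium import webdriver
from selenium.webdriver import ChromeOptions

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--headless')      # run Chrome without opening a window
option.add_argument('--disable-gpu')   # commonly recommended together with headless mode
bro = webdriver.Chrome(executable_path=r'C:\spider\scrapy1\chromedriver.exe', options=option)
```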
Reposted from: https://www.cnblogs.com/zhangchen-sx/p/10834494.html
總結