Python:阳光热线问政平台爬虫
生活随笔
收集整理的這篇文章主要介紹了
Python:阳光热线问政平台爬虫
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
爬取投訴帖子的編號、帖子的url、帖子的標題,和帖子里的內容。
items.py
import scrapy


class DongguanItem(scrapy.Item):
    """Item holding one complaint post scraped from the Sunshine Hotline site."""
    # title of each post
    title = scrapy.Field()
    # serial number of each post
    number = scrapy.Field()
    # text content of each post
    content = scrapy.Field()
    # url of each post
    url = scrapy.Field()
spiders/sunwz.py
Spider 版本
# -*- coding: utf-8 -*-
import scrapy
from dongguan.items import DongguanItem


class SunSpider(scrapy.Spider):
    """Plain-Spider version: paginate manually and follow each post link.

    BUG FIX: the original declared ``class SunSpider(CrawlSpider)`` without
    importing CrawlSpider (NameError at import time); since this version
    overrides parse() and paginates by hand, scrapy.Spider is the right base.
    """
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Extract post links from a listing page and schedule the next page."""
        # Links to every post on the current listing page.
        links = response.xpath(
            "//div[@class='greyframe']/table//td/a[@class='news14']/@href"
        ).extract()
        # One request per post; parse_item handles the detail page.
        for link in links:
            yield scrapy.Request(link, callback=self.parse_item)
        # Pagination stops after the last known offset (71130); pages step by 30.
        if self.offset <= 71130:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_item(self, response):
        """Extract title, number, content and url from one post page."""
        item = DongguanItem()
        # Post title.
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()[0]
        # Post number: last colon-separated field of the title's last word.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Prefer the layout used when the post contains images...
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if len(content) == 0:
            # ...fall back to the image-free layout.
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()'
            ).extract()
        # Join the text-node list and strip surrounding whitespace.
        # (The original duplicated this assignment in both branches.)
        item['content'] = "".join(content).strip()
        # Post URL.
        item['url'] = response.url
        yield item
CrawlSpider 版本
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem
import time


class SunSpider(CrawlSpider):
    """CrawlSpider version: Rules discover both listing pages and posts."""
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Matches every pagination (listing) link.
    pagelink = LinkExtractor(allow=('type=4'))
    # Matches every post detail page.
    # BUG FIX: '.' before 'shtml' was unescaped and matched any character.
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml')

    rules = [
        # Special case for this site: pagination links are mangled and must be
        # rewritten by deal_links before they can be requested.
        Rule(pagelink, process_links="deal_links", follow=True),
        Rule(contentlink, callback='parse_item'),
    ]

    def deal_links(self, links):
        """Rewrite the site's broken pagination URLs.

        'Type&type=4?page=xxx' must become 'Type?type=4&page=xxx'
        (equivalently 'Type&page=xxx?type=4' -> 'Type?page=xxx&type=4'),
        otherwise the request cannot be sent.
        """
        for link in links:
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
            # BUG FIX: py2-only `print link.url` -> function call, valid in 2 and 3.
            print(link.url)
        return links

    def parse_item(self, response):
        """Extract title, number, content and url from one post page."""
        print(response.url)
        item = DongguanItem()
        # Post title.
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()[0]
        # Post number: last colon-separated field of the title's last word.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Prefer the layout used when the post contains images...
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if len(content) == 0:
            # ...fall back to the image-free layout.
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()'
            ).extract()
        # Join the text-node list and strip surrounding whitespace.
        item['content'] = "".join(content).strip()
        # Post URL.
        item['url'] = response.url
        yield item
pipelines.py
# -*- coding: utf-8 -*-
# codecs lets us open the output file with an explicit text encoding.
import codecs
import io
import json


class JsonWriterPipeline(object):
    """Item pipeline that appends each item to sunwz.json as one JSON line."""

    def __init__(self):
        # Write-only file handle, utf-8 encoded.
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line; keep non-ASCII characters human-readable.
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        # BUG FIX: the original defined spider_closed() — a signal-handler
        # name Scrapy never calls on a plain pipeline — and closed the
        # nonexistent self.file. close_spider is the pipeline hook, and the
        # handle is self.filename.
        self.filename.close()
settings.py
# Enable the JSON writer pipeline defined in pipelines.py.
# BUG FIX: the original registered 'dongguan.pipelines.DongguanPipeline',
# but the class defined in pipelines.py is JsonWriterPipeline.
ITEM_PIPELINES = {
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# Log file name and level.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
在項(xiàng)目根目錄下新建main.py文件,用于調(diào)試
from scrapy import cmdline

# Debug entry point: run the spider without the scrapy CLI.
# BUG FIX: both spiders declare name = 'sun', but the original executed
# 'scrapy crawl sunwz', which would fail with "Spider not found".
cmdline.execute('scrapy crawl sun'.split())
執(zhí)行程序
python2 main.py
總結(jié)
以上是生活随笔為你收集整理的Python:阳光热线问政平台爬虫的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Python:Scrapy实战项目手机A
- 下一篇: Mysql中的递归层次查询(父子查询,无