爬空气质量MySQL_爬虫:利用selenium采集某某环境网站的空气质量数据
前言:在上一篇文章中,我們介紹了在http://PM2.5.in這個網站采集空氣質量的數據,本篇文章是對其產生的一些問題的另一種解決方案,提供更加權威的數據采集。
技術框架:selenium、json、etree
這裏的selenium是一種自動化測試的工具,它可以幫助我們模擬瀏覽器打開網頁並獲取網頁數據,本文之所以選擇這種方式進行,是因為以requests方式直接請求無法獲取到正確的數據,這個網頁的數據是動態加載,需要用戶執行點擊操作才會被請求
我們還是按照常規套路來分析下這個網站,打開F12,看下這個網站的數據請求
可以發現這個網站的數據的請求接口,但當我們直接用requests去請求這個接口,會發現無法獲取正確的數據,原因是這個網站采用了MmEwMD這個值進行了反爬蟲,這個是一個比較常見的反爬蟲措施,他這個值是在發起請求時動態生成的,最簡單的解決這個問題的辦法就是采用selenium之類的模擬瀏覽器方法進行請求,這樣的話,發出的請求也會自動帶上這個參數
請求的代碼如下所示
driverPath = 'browser\\chromedriver.exe'
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# options.add_argument(('--proxy-server=http://' + ip))
browser = webdriver.Chrome(options=options, executable_path=driverPath)
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
browser.get(self.url)
html = browser.page_source
browser.quit()
# print(html)
reponse = etree.HTML(html)
data = reponse.xpath('//body/text()')[0]
json_data = json.loads(data)
我們通過調用谷歌瀏覽器直接請求對應的頁面,獲取到數據後,關閉瀏覽器,通過etree解析網頁結果,通過觀察發現,我們獲取到的數據是json數組,因此我們使用json解析數據,然後將對應的數據存儲到數據庫
result_list = json_data['data']['hour']
print(result_list)
for result in result_list:
item = dict()
item['affect'] = result['AFFECTINFO']
item['action'] = result['SUGGEST']
if('AQIPRIMPOLLUTE' in result):
item['primary_pollutant'] = result['AQIPRIMPOLLUTE']
else:
item['primary_pollutant'] = '無'
item['AQI'] = result['AQI']
item['PM2.5/1h'] = result['PM25']
item['PM10/1h'] = result['PM10']
item['CO/1h'] = result['CO']
item['NO2/1h'] = result['NO2']
item['O3/1h'] = result['O3']
item['O3/8h'] = result['O3_2']
item['SO2/1h'] = result['SO2']
item['city_name'] = result['POINTNAME']
item['level'] = result['CODEAQILEVEL']+'('+result['AQILEVELNAME']+')'
item['live_data_time'] = result['MONITORTIME']
item['live_data_time'] = datetime.datetime.strptime(item['live_data_time'], "%Y年%m月%d日%H")
update_time = item['live_data_time'].strftime('%Y-%m-%d %H:%M:%S')
item['live_data_unit'] = 'μg/m3(CO為mg/m3)'
if(item['city_name'] in city_config):
self.save_mysql(item)
success_count = success_count+1
log_text = '采集的城市:{},采集的結(jié)果:{}'.format(item['city_name'],'成功')
self.save_log({'log_type':'0','log_text':log_text})
self.save_log({'log_type':'3','log_text':log_text})
self.update_spider_time(update_time)
# Persist one spider run-log entry into the `log` table.
def save_log(self, item):
    """Insert a log row and commit immediately.

    Fix: restored the indentation that was lost in the article listing so
    the snippet is valid Python; parameters are bound via placeholders
    (no SQL string concatenation).

    :param item: dict with keys ``'log_text'`` and ``'log_type'``;
                 the creation timestamp is added here.
    """
    sql = 'INSERT INTO log(log_text,log_type,created_time) VALUES (%s,%s,%s)'
    values = [item['log_text'], item['log_type'], datetime.datetime.now()]
    self.cursor.execute(sql, values)
    self.conn.commit()
def save_mysql(self, item):
    """Insert one air-quality record unless the same city+time already exists.

    Fix: restored the indentation that was lost in the article listing so
    the snippet is valid Python.

    NOTE(review): ``data['count']`` assumes the cursor returns dict rows
    (pymysql DictCursor) — confirm against the connection setup.

    :param item: dict produced by the collection loop (city, level,
                 pollutant readings, monitor time, unit, ...).
    """
    # Look for an existing row for this city at this monitor time.
    query_sql = 'select count(1) as count from kongqizhiliang where city_name= %s and live_data_time = %s'
    values = [item['city_name'], item['live_data_time']]
    self.cursor.execute(query_sql, values)
    data = self.cursor.fetchone()
    # Insert only when no record exists for the same city and hour
    # (poor-man's dedup; the table has no unique constraint we can rely on).
    if data['count'] == 0:
        sql = ("INSERT kongqizhiliang(city_name,level,live_data_time,live_data_unit,AQI,PM25_1h,PM10_1h,CO_1h"
               ",NO2_1h,O3_1h,O3_8h,SO2_1h,affect,primary_pollutant,action"
               ") VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        values = [item['city_name'], item['level'], item['live_data_time'], item['live_data_unit'], item['AQI'],
                  item['PM2.5/1h'], item['PM10/1h'], item['CO/1h'], item['NO2/1h'], item['O3/1h'], item['O3/8h'],
                  item['SO2/1h'], item['affect'], item['primary_pollutant'], item['action']]
        self.cursor.execute(sql, values)
        self.conn.commit()
其實當初這個反爬蟲措施也困擾了我一段時間的,我這裏采用的是最簡單的方法解決,雖然效率不高,但能解決我的需求
完整代碼如下:其中部分代碼是可以不需要的,比如redis和config那個,你們自己改一下,不會的可以問我,這個是當時給別人畢設做的,還有其他功能,所以會有一些其他的
"""
采集空氣質(zhì)量的數(shù)據(jù)
目標(biāo)網(wǎng)站:http://sthjt.hubei.gov.cn/hjsj/
"""
import requests
from lxml import etree
import re
from xpinyin import Pinyin
import pymysql
import sys
from settings.config import *
from utils import RedisUtil
import datetime
import json
from selenium import webdriver
class kongqizhiliang:
    """Air-quality spider for http://sthjt.hubei.gov.cn/hjsj/.

    The site protects its JSON endpoint with a dynamically generated
    ``MmEwMD`` anti-bot token, so a plain ``requests`` call fails; the data
    is therefore fetched through a real Chrome browser via selenium, parsed
    as JSON, and the records for the configured cities are stored in MySQL.
    Run logs go into the ``log`` table, the latest data timestamp into Redis.

    Fixes vs. the original listing: indentation restored (the article had
    stripped it), the browser is now always closed via ``try/finally`` so
    Chrome does not leak on a failed fetch, mojibake inside the log strings
    repaired, and the fetch/mapping steps split into private helpers.
    """

    # NOTE(review): kept for compatibility, but not used by the
    # selenium-based fetch below.
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # JSON endpoint that returns the hourly AQI payload.
    url = 'http://sthjt.hubei.gov.cn/wcmapi/service/aqi.xhtml'
    # Redis key holding the list of cities to collect.
    redis_key = 'kongqi:config_city'
    # Redis key holding the timestamp of the newest collected record.
    update_time = 'kongqi:update_time'
    # Chinese-to-pinyin converter (used by get_code).
    pinyin = Pinyin()

    def __init__(self):
        # Connection parameters (host, port, user, ...) come from
        # settings.config via the module-level star import.
        self.conn = pymysql.connect(host=host, port=port, user=user,
                                    passwd=passwd, db=db, charset=charset)
        # DictCursor so fetched rows are dicts (save_mysql reads row['count']).
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    # Convert a Chinese city name to its pinyin code.
    def get_code(self, city_name):
        """Return the pinyin transliteration of *city_name* (no separator)."""
        return self.pinyin.get_pinyin(city_name, '')

    def get_city_config(self):
        """Return the list of target cities stored in Redis."""
        redis_util = RedisUtil.get_redis()
        return redis_util.list_get_range(self.redis_key)

    def update_spider_time(self, update_time):
        """Persist the timestamp of the newest collected record to Redis."""
        redis_util = RedisUtil.get_redis()
        redis_util.str_set(self.update_time, update_time)

    def _fetch_json(self):
        """Fetch the AQI endpoint through Chrome and return the parsed JSON.

        Fix: ``browser.quit()`` now runs in a ``finally`` block so the
        Chrome process is not leaked when navigation or parsing raises.
        """
        driverPath = 'browser\\chromedriver.exe'
        options = webdriver.ChromeOptions()
        # Hide the "controlled by automated software" banner and the
        # automation extension so the site is less likely to detect us.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        # options.add_argument(('--proxy-server=http://' + ip))
        browser = webdriver.Chrome(options=options, executable_path=driverPath)
        try:
            # Mask navigator.webdriver before any page script runs.
            browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                })
                """
            })
            browser.get(self.url)
            html = browser.page_source
        finally:
            browser.quit()
        # The endpoint answers with raw JSON, which ends up as the body
        # text of the rendered page.
        response = etree.HTML(html)
        data = response.xpath('//body/text()')[0]
        return json.loads(data)

    def _build_item(self, result):
        """Map one raw API record onto the item dict stored in MySQL."""
        item = dict()
        item['affect'] = result['AFFECTINFO']
        item['action'] = result['SUGGEST']
        # The primary-pollutant key is absent from the payload when there
        # is none to report.
        item['primary_pollutant'] = result.get('AQIPRIMPOLLUTE', '無')
        item['AQI'] = result['AQI']
        item['PM2.5/1h'] = result['PM25']
        item['PM10/1h'] = result['PM10']
        item['CO/1h'] = result['CO']
        item['NO2/1h'] = result['NO2']
        item['O3/1h'] = result['O3']
        item['O3/8h'] = result['O3_2']
        item['SO2/1h'] = result['SO2']
        item['city_name'] = result['POINTNAME']
        item['level'] = result['CODEAQILEVEL'] + '(' + result['AQILEVELNAME'] + ')'
        # e.g. "2021年01月02日15" -> datetime(2021, 1, 2, 15, 0)
        item['live_data_time'] = datetime.datetime.strptime(result['MONITORTIME'], "%Y年%m月%d日%H")
        item['live_data_unit'] = 'μg/m3(CO為mg/m3)'
        return item

    def get_data(self):
        """Collect the latest hourly records and store the configured cities."""
        city_config = self.get_city_config()
        log_text = '采集開始,準備采集的城市:{},計劃采集的數據量:{}'.format(city_config, len(city_config))
        self.save_log({'log_type': '2', 'log_text': log_text})
        success_count = 0
        update_time = ''
        json_data = self._fetch_json()
        result_list = json_data['data']['hour']
        print(result_list)
        for result in result_list:
            item = self._build_item(result)
            # Remember the newest monitor time seen; pushed to Redis below.
            update_time = item['live_data_time'].strftime('%Y-%m-%d %H:%M:%S')
            # Only persist cities the operator configured in Redis.
            if item['city_name'] in city_config:
                self.save_mysql(item)
                success_count = success_count + 1
                log_text = '采集的城市:{},采集的結果:{}'.format(item['city_name'], '成功')
                self.save_log({'log_type': '0', 'log_text': log_text})
        # NOTE(review): the type-'3' summary re-logs the last message and
        # success_count is never reported — kept as-is to preserve whatever
        # the log consumers expect; consider logging the count instead.
        self.save_log({'log_type': '3', 'log_text': log_text})
        self.update_spider_time(update_time)

    # Persist one spider run-log entry into the `log` table.
    def save_log(self, item):
        """Insert a log row (text, type, now) and commit immediately."""
        sql = 'INSERT INTO log(log_text,log_type,created_time) VALUES (%s,%s,%s)'
        values = [item['log_text'], item['log_type'], datetime.datetime.now()]
        self.cursor.execute(sql, values)
        self.conn.commit()

    def save_mysql(self, item):
        """Insert one record unless the same city+time already exists."""
        # Look for an existing row for this city at this monitor time.
        query_sql = 'select count(1) as count from kongqizhiliang where city_name= %s and live_data_time = %s'
        values = [item['city_name'], item['live_data_time']]
        self.cursor.execute(query_sql, values)
        data = self.cursor.fetchone()
        # Insert only when no record exists for the same city and hour.
        if data['count'] == 0:
            sql = ("INSERT kongqizhiliang(city_name,level,live_data_time,live_data_unit,AQI,PM25_1h,PM10_1h,CO_1h"
                   ",NO2_1h,O3_1h,O3_8h,SO2_1h,affect,primary_pollutant,action"
                   ") VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            values = [item['city_name'], item['level'], item['live_data_time'], item['live_data_unit'], item['AQI'],
                      item['PM2.5/1h'], item['PM10/1h'], item['CO/1h'], item['NO2/1h'], item['O3/1h'], item['O3/8h'],
                      item['SO2/1h'], item['affect'], item['primary_pollutant'], item['action']]
            self.cursor.execute(sql, values)
            self.conn.commit()
if __name__ == "__main__":
    # Entry point: run one collection pass when executed as a script.
    # Fix: restored the indentation lost in the article listing.
    app = kongqizhiliang()
    app.get_data()
本文首發于爬蟲:利用selenium采集某某環境網站的空氣質量數據 www.bizhibihui.com
總結
以上是生活随笔為你收集整理的爬空气质量MySQL_爬虫:利用selenium采集某某环境网站的空气质量数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 常见排序之——插入排序
- 下一篇: SQL分页查询的介绍以及好处~~