爬空气质量MySQL_爬虫:利用selenium采集某某环境网站的空气质量数据
前言:在上一篇文章中,我們介紹了在http://PM2.5.in這個網站采集空氣質量的數據,本篇文章是對其產生的一些問題的另一種解決方案,提供更加權威的數據采集。
技術框架:selenium、json、etree
這裏的selenium是一種自動化測試的工具,它可以幫助我們模擬瀏覽器打開網頁並獲取網頁數據,本文之所以選擇這種方式進行,是因為以requests方式直接請求無法獲取到正確的數據,這個網頁的數據是動態加載,需要用戶執行點擊操作才會被請求
我們還是按照常規套路來分析下這個網站,打開F12,看下這個網站的數據請求
可以發現這個網站的數據的請求接口,但當我們直接用requests去請求這個接口,會發現無法獲取正確的數據,原因是這個網站采用了MmEwMD這個值進行了反爬蟲,這個是一個比較常見的反爬蟲措施,他這個值是在發起請求時動態生成的,最簡單的解決這個問題的辦法就是采用selenium之類的模擬瀏覽器方法進行請求,這樣的話,發出的請求也會自動帶上這個參數
請求的代碼如下所示
driverPath = 'browser\\chromedriver.exe'
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# options.add_argument(('--proxy-server=http://' + ip))
browser = webdriver.Chrome(options=options, executable_path=driverPath)
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
browser.get(self.url)
html = browser.page_source
browser.quit()
# print(html)
reponse = etree.HTML(html)
data = reponse.xpath('//body/text()')[0]
json_data = json.loads(data)
我們通過調用谷歌瀏覽器直接請求對應的頁面,獲取到數據後,關閉瀏覽器,通過etree解析網頁結果,通過觀察發現,我們獲取到的數據是json數組,因此我們使用json解析數據,然後將對應的數據存儲到數據庫
result_list = json_data['data']['hour']
print(result_list)
for result in result_list:
item = dict()
item['affect'] = result['AFFECTINFO']
item['action'] = result['SUGGEST']
if('AQIPRIMPOLLUTE' in result):
item['primary_pollutant'] = result['AQIPRIMPOLLUTE']
else:
item['primary_pollutant'] = '無'
item['AQI'] = result['AQI']
item['PM2.5/1h'] = result['PM25']
item['PM10/1h'] = result['PM10']
item['CO/1h'] = result['CO']
item['NO2/1h'] = result['NO2']
item['O3/1h'] = result['O3']
item['O3/8h'] = result['O3_2']
item['SO2/1h'] = result['SO2']
item['city_name'] = result['POINTNAME']
item['level'] = result['CODEAQILEVEL']+'('+result['AQILEVELNAME']+')'
item['live_data_time'] = result['MONITORTIME']
item['live_data_time'] = datetime.datetime.strptime(item['live_data_time'], "%Y年%m月%d日%H")
update_time = item['live_data_time'].strftime('%Y-%m-%d %H:%M:%S')
item['live_data_unit'] = 'μg/m3(CO為mg/m3)'
if(item['city_name'] in city_config):
self.save_mysql(item)
success_count = success_count+1
log_text = '采集的城市:{},采集的結(jié)果:{}'.format(item['city_name'],'成功')
self.save_log({'log_type':'0','log_text':log_text})
self.save_log({'log_type':'3','log_text':log_text})
self.update_spider_time(update_time)
# Persist one spider run-log entry into the `log` table.
def save_log(self, item):
    """Insert a log row and commit immediately.

    Fix: restored the indentation that was lost in the article listing so
    the snippet is valid Python; parameters are bound via placeholders
    (no SQL string concatenation).

    :param item: dict with keys ``'log_text'`` and ``'log_type'``;
                 the creation timestamp is added here.
    """
    sql = 'INSERT INTO log(log_text,log_type,created_time) VALUES (%s,%s,%s)'
    values = [item['log_text'], item['log_type'], datetime.datetime.now()]
    self.cursor.execute(sql, values)
    self.conn.commit()
def save_mysql(self, item):
    """Insert one air-quality record unless the same city+time already exists.

    Fix: restored the indentation that was lost in the article listing so
    the snippet is valid Python.

    NOTE(review): ``data['count']`` assumes the cursor returns dict rows
    (pymysql DictCursor) — confirm against the connection setup.

    :param item: dict produced by the collection loop (city, level,
                 pollutant readings, monitor time, unit, ...).
    """
    # Look for an existing row for this city at this monitor time.
    query_sql = 'select count(1) as count from kongqizhiliang where city_name= %s and live_data_time = %s'
    values = [item['city_name'], item['live_data_time']]
    self.cursor.execute(query_sql, values)
    data = self.cursor.fetchone()
    # Insert only when no record exists for the same city and hour
    # (poor-man's dedup; the table has no unique constraint we can rely on).
    if data['count'] == 0:
        sql = ("INSERT kongqizhiliang(city_name,level,live_data_time,live_data_unit,AQI,PM25_1h,PM10_1h,CO_1h"
               ",NO2_1h,O3_1h,O3_8h,SO2_1h,affect,primary_pollutant,action"
               ") VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        values = [item['city_name'], item['level'], item['live_data_time'], item['live_data_unit'], item['AQI'],
                  item['PM2.5/1h'], item['PM10/1h'], item['CO/1h'], item['NO2/1h'], item['O3/1h'], item['O3/8h'],
                  item['SO2/1h'], item['affect'], item['primary_pollutant'], item['action']]
        self.cursor.execute(sql, values)
        self.conn.commit()
其實當初這個反爬蟲措施也困擾了我一段時間的,我這裏采用的是最簡單的方法解決,雖然效率不高,但能解決我的需求
完整代碼如下:其中部分代碼是可以不需要的,比如redis和config那個,你們自己改一下,不會的可以問我,這個是當時給別人畢設做的,還有其他功能,所以會有一些其他的
"""
采集空氣質(zhì)量的數(shù)據(jù)
目標(biāo)網(wǎng)站:http://sthjt.hubei.gov.cn/hjsj/
"""
import requests
from lxml import etree
import re
from xpinyin import Pinyin
import pymysql
import sys
from settings.config import *
from utils import RedisUtil
import datetime
import json
from selenium import webdriver
class kongqizhiliang:
    """Air-quality spider for http://sthjt.hubei.gov.cn/hjsj/.

    The site protects its JSON endpoint with a dynamically generated
    ``MmEwMD`` anti-bot token, so a plain ``requests`` call fails; the data
    is therefore fetched through a real Chrome browser via selenium, parsed
    as JSON, and the records for the configured cities are stored in MySQL.
    Run logs go into the ``log`` table, the latest data timestamp into Redis.

    Fixes vs. the original listing: indentation restored (the article had
    stripped it), the browser is now always closed via ``try/finally`` so
    Chrome does not leak on a failed fetch, mojibake inside the log strings
    repaired, and the fetch/mapping steps split into private helpers.
    """

    # NOTE(review): kept for compatibility, but not used by the
    # selenium-based fetch below.
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # JSON endpoint that returns the hourly AQI payload.
    url = 'http://sthjt.hubei.gov.cn/wcmapi/service/aqi.xhtml'
    # Redis key holding the list of cities to collect.
    redis_key = 'kongqi:config_city'
    # Redis key holding the timestamp of the newest collected record.
    update_time = 'kongqi:update_time'
    # Chinese-to-pinyin converter (used by get_code).
    pinyin = Pinyin()

    def __init__(self):
        # Connection parameters (host, port, user, ...) come from
        # settings.config via the module-level star import.
        self.conn = pymysql.connect(host=host, port=port, user=user,
                                    passwd=passwd, db=db, charset=charset)
        # DictCursor so fetched rows are dicts (save_mysql reads row['count']).
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    # Convert a Chinese city name to its pinyin code.
    def get_code(self, city_name):
        """Return the pinyin transliteration of *city_name* (no separator)."""
        return self.pinyin.get_pinyin(city_name, '')

    def get_city_config(self):
        """Return the list of target cities stored in Redis."""
        redis_util = RedisUtil.get_redis()
        return redis_util.list_get_range(self.redis_key)

    def update_spider_time(self, update_time):
        """Persist the timestamp of the newest collected record to Redis."""
        redis_util = RedisUtil.get_redis()
        redis_util.str_set(self.update_time, update_time)

    def _fetch_json(self):
        """Fetch the AQI endpoint through Chrome and return the parsed JSON.

        Fix: ``browser.quit()`` now runs in a ``finally`` block so the
        Chrome process is not leaked when navigation or parsing raises.
        """
        driverPath = 'browser\\chromedriver.exe'
        options = webdriver.ChromeOptions()
        # Hide the "controlled by automated software" banner and the
        # automation extension so the site is less likely to detect us.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        # options.add_argument(('--proxy-server=http://' + ip))
        browser = webdriver.Chrome(options=options, executable_path=driverPath)
        try:
            # Mask navigator.webdriver before any page script runs.
            browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                })
                """
            })
            browser.get(self.url)
            html = browser.page_source
        finally:
            browser.quit()
        # The endpoint answers with raw JSON, which ends up as the body
        # text of the rendered page.
        response = etree.HTML(html)
        data = response.xpath('//body/text()')[0]
        return json.loads(data)

    def _build_item(self, result):
        """Map one raw API record onto the item dict stored in MySQL."""
        item = dict()
        item['affect'] = result['AFFECTINFO']
        item['action'] = result['SUGGEST']
        # The primary-pollutant key is absent from the payload when there
        # is none to report.
        item['primary_pollutant'] = result.get('AQIPRIMPOLLUTE', '無')
        item['AQI'] = result['AQI']
        item['PM2.5/1h'] = result['PM25']
        item['PM10/1h'] = result['PM10']
        item['CO/1h'] = result['CO']
        item['NO2/1h'] = result['NO2']
        item['O3/1h'] = result['O3']
        item['O3/8h'] = result['O3_2']
        item['SO2/1h'] = result['SO2']
        item['city_name'] = result['POINTNAME']
        item['level'] = result['CODEAQILEVEL'] + '(' + result['AQILEVELNAME'] + ')'
        # e.g. "2021年01月02日15" -> datetime(2021, 1, 2, 15, 0)
        item['live_data_time'] = datetime.datetime.strptime(result['MONITORTIME'], "%Y年%m月%d日%H")
        item['live_data_unit'] = 'μg/m3(CO為mg/m3)'
        return item

    def get_data(self):
        """Collect the latest hourly records and store the configured cities."""
        city_config = self.get_city_config()
        log_text = '采集開始,準備采集的城市:{},計劃采集的數據量:{}'.format(city_config, len(city_config))
        self.save_log({'log_type': '2', 'log_text': log_text})
        success_count = 0
        update_time = ''
        json_data = self._fetch_json()
        result_list = json_data['data']['hour']
        print(result_list)
        for result in result_list:
            item = self._build_item(result)
            # Remember the newest monitor time seen; pushed to Redis below.
            update_time = item['live_data_time'].strftime('%Y-%m-%d %H:%M:%S')
            # Only persist cities the operator configured in Redis.
            if item['city_name'] in city_config:
                self.save_mysql(item)
                success_count = success_count + 1
                log_text = '采集的城市:{},采集的結果:{}'.format(item['city_name'], '成功')
                self.save_log({'log_type': '0', 'log_text': log_text})
        # NOTE(review): the type-'3' summary re-logs the last message and
        # success_count is never reported — kept as-is to preserve whatever
        # the log consumers expect; consider logging the count instead.
        self.save_log({'log_type': '3', 'log_text': log_text})
        self.update_spider_time(update_time)

    # Persist one spider run-log entry into the `log` table.
    def save_log(self, item):
        """Insert a log row (text, type, now) and commit immediately."""
        sql = 'INSERT INTO log(log_text,log_type,created_time) VALUES (%s,%s,%s)'
        values = [item['log_text'], item['log_type'], datetime.datetime.now()]
        self.cursor.execute(sql, values)
        self.conn.commit()

    def save_mysql(self, item):
        """Insert one record unless the same city+time already exists."""
        # Look for an existing row for this city at this monitor time.
        query_sql = 'select count(1) as count from kongqizhiliang where city_name= %s and live_data_time = %s'
        values = [item['city_name'], item['live_data_time']]
        self.cursor.execute(query_sql, values)
        data = self.cursor.fetchone()
        # Insert only when no record exists for the same city and hour.
        if data['count'] == 0:
            sql = ("INSERT kongqizhiliang(city_name,level,live_data_time,live_data_unit,AQI,PM25_1h,PM10_1h,CO_1h"
                   ",NO2_1h,O3_1h,O3_8h,SO2_1h,affect,primary_pollutant,action"
                   ") VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            values = [item['city_name'], item['level'], item['live_data_time'], item['live_data_unit'], item['AQI'],
                      item['PM2.5/1h'], item['PM10/1h'], item['CO/1h'], item['NO2/1h'], item['O3/1h'], item['O3/8h'],
                      item['SO2/1h'], item['affect'], item['primary_pollutant'], item['action']]
            self.cursor.execute(sql, values)
            self.conn.commit()
if __name__ == "__main__":
    # Entry point: run one collection pass when executed as a script.
    # Fix: restored the indentation lost in the article listing.
    app = kongqizhiliang()
    app.get_data()
本文首發于爬蟲:利用selenium采集某某環境網站的空氣質量數據 www.bizhibihui.com
總結
以上是生活随笔為你收集整理的爬空气质量MySQL_爬虫:利用selenium采集某某环境网站的空气质量数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 常见排序之——插入排序
- 下一篇: SQL分页查询的介绍以及好处~~