php抓取统计局区划代码,抓取国家统计局的代码和名称,爬取,区域,划分,及
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests
# Browser-like User-Agent header passed as `headers=` to every requests.get call in this file.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
獲取一級代碼、名稱、下一級鏈接
通過設置參數originUrl來調整爬取的年份
def getOneLevelCodeName(originUrl='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'):
    """Fetch the first-level (province) codes, names and next-level links.

    Args:
        originUrl: URL of the index page to crawl. Point it at another
            year's index page to crawl that year instead.

    Returns:
        A list of ``(code, name, url)`` tuples, one per link found inside
        the elements with class ``provincetr``. ``code`` is the first two
        characters of the link's href padded with ten zeros, ``name`` is the
        link text prefixed with '中國-', and ``url`` is the absolute URL of
        the next-level page.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    web = requests.get(originUrl, headers=headers, timeout=30)
    # Let requests guess the encoding from the body before reading .text.
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    oneLevelWebUrl = []
    for row in soup.select('.provincetr'):
        for province in row.select('a'):
            href = province['href']
            oneLevelWebUrl.append((
                href[0:2] + '0000000000',
                '中國-' + province.text,
                # Resolve the relative href against the page URL instead of
                # slicing a fixed number of characters off originUrl.
                urljoin(originUrl, href),
            ))
    return oneLevelWebUrl
獲取二級代碼、名稱、下一級鏈接
# Given a first-level link, fetch the next level.
def getSecodLevelCodeName(proLevelName=None, url='None'):
    """Fetch the second-level codes, names and next-level links.

    Args:
        proLevelName: Full name of the parent entry (segments joined by
            '-'); used as the prefix of every returned name.
        url: Absolute URL of the parent's page, or the string 'None' when
            there is no page to crawl.

    Returns:
        A list of ``(code, name, url)`` tuples, one per element with class
        ``citytr``. Returns an empty list when ``proLevelName`` is None or
        ``url`` is 'None'. Assumes each row's text is a 12-character code
        immediately followed by the name -- confirm against the page layout.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    # Always hand back a list so callers can iterate or build a DataFrame
    # without special-casing a missing parent.
    if proLevelName is None or url == 'None':
        return []

    web = requests.get(url, headers=headers, timeout=30)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.citytr'):
        code, name = tag.text[0:12], tag.text[12:]
        # The generic "municipal district" row carries no place name of its
        # own, so prefix it with the parent's last name segment. Both the
        # simplified and the traditional spelling are accepted.
        if name in ('市辖区', '市轄區'):
            name = proLevelName.split('-')[-1] + name
        retList.append((
            code,
            proLevelName + '-' + name,
            # Resolve the relative href against the current page URL.
            urljoin(url, tag.select('a')[0]['href']),
        ))
    return retList
獲取三級代碼、名稱、下一級鏈接
# Given a second-level link, fetch the next level.
def getThirdLevelCodeName(proLevelName=None, url='None'):
    """Fetch the third-level codes, names and next-level links.

    Args:
        proLevelName: Full name of the parent entry (segments joined by
            '-'); used as the prefix of every returned name.
        url: Absolute URL of the parent's page, or the string 'None' when
            there is no page to crawl.

    Returns:
        A list of ``(code, name, url)`` tuples, one per element with class
        ``countytr``. Rows that contain no link get the parent's last name
        segment prefixed to their own name and the string 'None' as the
        url. Returns an empty list when ``proLevelName`` is None or ``url``
        is 'None'. Assumes each row's text is a 12-character code
        immediately followed by the name -- confirm against the page layout.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    if proLevelName is None or url == 'None':
        return []

    web = requests.get(url, headers=headers, timeout=30)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.countytr'):
        code, name = tag.text[0:12], tag.text[12:]
        # Test for a link explicitly rather than relying on a bare except,
        # which would also hide unrelated errors.
        anchors = tag.select('a')
        href = anchors[0].get('href') if anchors else None
        if href:
            retList.append((code, proLevelName + '-' + name, urljoin(url, href)))
        else:
            # No next-level page: keep the 'None' string sentinel the other
            # functions in this file test for.
            retList.append((
                code,
                proLevelName + '-' + proLevelName.split('-')[-1] + name,
                'None',
            ))
    return retList
獲取四級代碼、名稱、下一級鏈接
# Given a third-level link, fetch the next level.
def getFourthLevelCodeName(proLevelName=None, url='None'):
    """Fetch the fourth-level codes, names and next-level links.

    Args:
        proLevelName: Full name of the parent entry (segments joined by
            '-'); used as the prefix of every returned name.
        url: Absolute URL of the parent's page, or the string 'None' when
            there is no page to crawl.

    Returns:
        A list of ``(code, name, url)`` tuples, one per element with class
        ``towntr``. Returns an empty list when ``proLevelName`` is None or
        ``url`` is 'None'. Assumes each row's text is a 12-character code
        immediately followed by the name -- confirm against the page layout.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    if proLevelName is None or url == 'None':
        return []

    web = requests.get(url, headers=headers, timeout=30)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.towntr'):
        retList.append((
            tag.text[0:12],
            proLevelName + '-' + tag.text[12:],
            # Resolve the relative href against the current page URL rather
            # than slicing a fixed-length prefix off it.
            urljoin(url, tag.select('a')[0]['href']),
        ))
    return retList
轉為DataFrame,輸出excel文件
# Combine the crawled levels into a single table.
# oneLevel is turned into a DataFrame directly; secondLevel, thirdLevel and
# fourthLevel are iterated and each item is turned into its own DataFrame
# before concatenation. NOTE(review): these four names are not defined in the
# visible part of this file -- presumably they hold the results of the
# get*LevelCodeName functions; confirm against the crawl driver.
pd_oneLevel = pd.DataFrame(oneLevel)
pd_secondLevel = pd.concat([pd.DataFrame(data) for data in secondLevel])
pd_thirdLevel = pd.concat([pd.DataFrame(data) for data in thirdLevel])
pd_fourthLevel = pd.concat([pd.DataFrame(data) for data in fourthLevel])

# ignore_index=True renumbers the rows so the combined table has one
# continuous index instead of the per-page indices.
pd_allLevel = pd.concat(
    [pd_oneLevel, pd_secondLevel, pd_thirdLevel, pd_fourthLevel],
    ignore_index=True,
)
# Column labels: division code, name, next-level URL.
pd_allLevel.columns = ['區劃代碼', '名稱', '下一級網址']
保存到當前目錄
import os

# Write the combined table to an Excel workbook in the current working
# directory. The path is built with os.getcwd() instead of slicing the last
# eight characters off os.path.realpath('__file__'), which only produced the
# working directory because the literal string '__file__' is eight
# characters long.
# NOTE(review): the file name says 2018 while the default originUrl of
# getOneLevelCodeName points at 2019 -- confirm which year is intended.
output_path = os.path.join(os.getcwd(), '2018區劃代碼及名稱.xlsx')
pd_allLevel.to_excel(output_path, index=False)
創作挑戰賽新人創作獎勵來咯,堅持創作打卡瓜分現金大獎總結
以上是生活随笔為你收集整理的php抓取统计局区划代码,抓取国家统计局的代码和名称,爬取,区域,划分,及的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 怎么查询开户行
- 下一篇: php ip处理函数,PHP取ip地址函