當前位置：首頁 > 编程语言 > python > 内容正文

python

python爬取全国社会组织查询网站

發布時間：2025/4/5 python 36 豆豆

生活随笔收集整理的這篇文章主要介紹了 python爬取全国社会组织查询网站，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# 民政部

def acquire_minzhengbu(to_page):
    """Fetch one result page from the ``mac-data`` tab of the organisation search.

    Posts the search form to ``http://www.chinanpo.gov.cn/search/orgcx.html``
    asking for page ``to_page``, then collects the text of every ``<a>``
    element inside the first element whose id is ``mac-data``. Consecutive
    links are grouped six at a time; each group becomes one record.

    Args:
        to_page: Page number to request; sent as both ``goto_page`` and
            ``to_page`` in the form data.

    Returns:
        A list of records, each a list of six stripped strings in document
        order. At most ``page_size`` (20) records are returned. A page with
        fewer records (for example the last page) yields a shorter list
        instead of failing.

    Raises:
        IndexError: If the response contains no element with id ``mac-data``.
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    fields_per_row = 6
    page_size = 20

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }
    url = "http://www.chinanpo.gov.cn/search/orgcx.html"

    data_m = {
        "t": 2,
        # NOTE(review): this value is already percent-encoded; requests
        # form-encodes the dict again, so the server receives the literal
        # escapes. Confirm that this is what the endpoint expects.
        "orgName": "%E5%AD%A6%E4%BC%9A",
        "corporateType": 1,
        "status": -1,
        "regNumB": 1,
        "regNumD": 2,
        "tabIndex": 1,
        "regNum": -1,
        "isHyxh": 2,
        "page_flag": "true",
        "pagesize_key": "macList",
        "goto_page": to_page,
        "current_page": 1,
        "total_count": 487,
        "page_size": page_size,
        "to_page": to_page,
    }

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.post(url, data=data_m, headers=headers, timeout=30)
    # Throttle before anything can raise, so failed pages are rate-limited too.
    time.sleep(5)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    containers = soup.find_all(id='mac-data')
    if not containers:
        # Same exception type the original indexing raised in this situation.
        raise IndexError(
            "no element with id 'mac-data' in response for page %s" % (to_page,)
        )

    # Look the links up once instead of re-querying the document per field.
    texts = [a.get_text().strip() for a in containers[0].find_all('a')]

    # Keep the original upper bound of 20 records (120 links) per page.
    # NOTE(review): assumes the container holds only record links.
    texts = texts[:fields_per_row * page_size]

    # Ignore a trailing incomplete group rather than raising on short pages.
    usable = len(texts) - len(texts) % fields_per_row
    return [
        texts[start:start + fields_per_row]
        for start in range(0, usable, fields_per_row)
    ]

# Crawl result pages 1..25 with acquire_minzhengbu and save them as one CSV.
total_mingzhen = []
for i in range(1, 26):
    print(i)
    try:
        data = acquire_minzhengbu(i)
    except Exception as exc:
        # `Exception` (not a bare except) keeps Ctrl-C able to stop the crawl;
        # a failed page is reported with its cause and skipped.
        print('error:', i, repr(exc))
    else:
        total_mingzhen += data

# Passing the headers to the constructor also works when no rows were
# collected; assigning `.columns` afterwards raises a length mismatch then.
# NOTE(review): the header and file-name literals below look damaged by text
# extraction (e.g. "會(huì)"); kept byte-for-byte, confirm the intended text.
data11 = pd.DataFrame(
    total_mingzhen,
    columns=['社會(huì)組織名稱','統(tǒng)一社會(huì)信用編碼','社會(huì)組織類型','法定代表人','成立登記日期','狀態(tài)'],
)

data11.to_csv('./學(xué)會(huì)_民政部登記.csv',encoding='utf-8',index=False)


# encoding = 'utf-8'

import requests
from bs4 import BeautifulSoup
import time

def acquire_difang(to_page):
    """Fetch one result page from the ``local-data`` tab of the organisation search.

    Posts the search form to ``http://www.chinanpo.gov.cn/search/orgcx.html``
    asking for page ``to_page``, then collects the text of every ``<a>``
    element inside the first element whose id is ``local-data``. Consecutive
    links are grouped six at a time; each group becomes one record.

    Args:
        to_page: Page number to request; sent as both ``goto_page`` and
            ``to_page`` in the form data.

    Returns:
        A list of records, each a list of six stripped strings in document
        order. At most ``page_size`` (20) records are returned. A page with
        fewer records yields a shorter list instead of failing.

    Raises:
        IndexError: If the response contains no element with id ``local-data``.
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    fields_per_row = 6
    page_size = 20

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }
    url = "http://www.chinanpo.gov.cn/search/orgcx.html"

    data_difang = {
        "t": 3,
        # NOTE(review): this value is already percent-encoded; requests
        # form-encodes the dict again, so the server receives the literal
        # escapes. Confirm that this is what the endpoint expects.
        "orgName": "%E5%AD%A6%E4%BC%9A",
        "corporateType": 1,
        "status": -1,
        "regNumB": 1,
        "regNumD": 2,
        "tabIndex": 2,
        "regNum": -1,
        "isHyxh": 2,
        "page_flag": "true",
        "pagesize_key": "usciList",
        "goto_page": to_page,
        "current_page": 1,
        "total_count": 30640,
        "page_size": page_size,
        "to_page": to_page,
    }

    # A timeout keeps one stalled connection from hanging the whole crawl.
    # No delay is inserted between requests here.
    response = requests.post(url, data=data_difang, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    containers = soup.find_all(id='local-data')
    if not containers:
        # Same exception type the original indexing raised in this situation.
        raise IndexError(
            "no element with id 'local-data' in response for page %s" % (to_page,)
        )

    # Look the links up once instead of re-querying the document per field.
    texts = [a.get_text().strip() for a in containers[0].find_all('a')]

    # Keep the original upper bound of 20 records (120 links) per page.
    # NOTE(review): assumes the container holds only record links.
    texts = texts[:fields_per_row * page_size]

    # Ignore a trailing incomplete group rather than raising on short pages.
    usable = len(texts) - len(texts) % fields_per_row
    return [
        texts[start:start + fields_per_row]
        for start in range(0, usable, fields_per_row)
    ]

# This listing's own import section does not import pandas, so bring it into
# scope here to keep the script runnable on its own.
import pandas as pd

# Crawl result pages 1..1532 with acquire_difang and save them as one CSV.
total_mingzhen = []

for i in range(1, 1533):
    print(i)
    try:
        data = acquire_difang(i)
    except Exception as exc:
        # `Exception` (not a bare except) keeps Ctrl-C able to stop this long
        # crawl; a failed page is reported with its cause and skipped.
        print('error:', i, repr(exc))
    else:
        total_mingzhen += data

# Passing the headers to the constructor also works when no rows were
# collected; assigning `.columns` afterwards raises a length mismatch then.
# NOTE(review): the header and file-name literals below look damaged by text
# extraction (e.g. "會(huì)"); kept byte-for-byte, confirm the intended text.
data12 = pd.DataFrame(
    total_mingzhen,
    columns=['社會(huì)組織名稱','統(tǒng)一社會(huì)信用編碼','社會(huì)組織類型','法定代表人','成立登記日期','狀態(tài)'],
)

data12.to_csv('./學(xué)會(huì)_地方登記.csv',encoding='utf-8',index=False)

總結

以上是生活随笔為你收集整理的python爬取全国社会组织查询网站的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： spark读取hbase数据
下一篇：《程序员代码面试指南第二版》Python