Python Web Scraping in Practice (1): Scraping a Person's Text and Images from Baidu Baike, plus a RESTful API
My GitHub repo: https://github.com/yuyongsheng1990/python_spider_from_bdbaike
1. Scraping a person's summary, infobox, and images from Baidu Baike

```python
# -*- coding: UTF-8 -*-
# @Project -> File: python_spider_from_bdbaike -> spider_baike_text_picture
# @Time: 2021/6/3 20:13
# @Author: Yu Yongsheng
# @Description: scrape a person's summary, infobox data, and images from Baidu Baike

import os
import re
import ssl
import urllib.parse
import urllib.request
from urllib.error import HTTPError

import requests
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy

# Avoid SSL certificate errors
ssl._create_default_https_context = ssl._create_unverified_context


# Scraper
def claw(content):
    # Fetch and download the HTML page
    url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)  # request URL
    # Request headers faking a browser, to avoid anti-scraping blocks
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    # Build the request object from the URL and headers
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    response = urllib.request.urlopen(req)  # send the request, get the response
    text = response.read().decode('utf-8')  # read the response body as text

    # Parse the HTML page
    soup = BeautifulSoup(text, 'lxml')  # build the soup object over the HTML source
    intro_tag = soup.find_all('div', class_="lemma-summary")  # summary paragraphs
    name_tag = soup.find_all('dt', class_="basicInfo-item name")    # infobox field names (dt tags)
    value_tag = soup.find_all('dd', class_="basicInfo-item value")  # infobox field values (dd tags)

    # Clean the summary: drop newlines and whitespace
    intro_after_filter = [re.sub('\n+', '', item.get_text()) for item in intro_tag]
    intro_after_filter = [''.join(i.split()) for i in intro_after_filter]  # strip \xa0 noise
    # Join the list of strings into one string
    intro_after_filter = ''.join(intro_after_filter)

    # Extract the infobox data
    profile_info = {}
    namelist = []
    valuelist = []
    for i in name_tag:  # collect the text of all dt tags
        name = ''.join(i.get_text().split())  # strip \xa0 noise
        namelist.append(name)
    for i in value_tag:  # collect the text of all dd tags
        value = i.get_text().strip(' ')
        # value = re.sub('\n+', '、', i.get_text()).strip('、')  # keep newlines as required
        # value = ''.join(value.split())  # would strip \xa0 but also spaces; not needed
        valuelist.append(value)
    # zip() pairs the two lists element by element into (name, value) tuples
    for i, j in zip(namelist, valuelist):
        profile_info[i] = j

    # Scrape images: find the img tags and collect their URLs
    img_urllist = []
    resp = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(resp.content, 'lxml')
    # img_list = soup.select('div .album-wrap')
    img_list = soup.select('a>div>img')
    for img in img_list:
        try:
            src = img.get('src')
            if re.match(r'https:(.*)image(.*)auto$', src):
                img_urllist.append(src)
        except Exception:  # tolerate tags without a usable src
            continue
    return intro_after_filter, profile_info, img_urllist


# Save the scraped data: summary, infobox, images
def download(name, intro, profile_dict, img_list):
    project_path = os.getcwd()

    # Save the Baike summary text
    if not os.path.exists('introduction'):
        os.mkdir('introduction')
    introduction_file = project_path + '/introduction/' + name + '.txt'
    with open(introduction_file, 'w') as f:
        f.write(intro)

    # Save the infobox data to Excel (xlwt writes an Excel file despite the .csv name)
    if not os.path.exists('profile'):
        os.mkdir('profile')
    profile_file = project_path + '/profile/' + 'profile.csv'
    field_list = ['中文名', '外文名', '別名', '性別', '學(xué)位', '職稱', '國(guó)籍', '民族',
                  '出生地', '籍貫', '出生日期', '逝世日期', '星座', '血型', '身高', '體重',
                  '畢業(yè)院校', '職業(yè)', '經(jì)紀(jì)公司', '代表作品', '主要成就', '生肖', '語種',
                  '特長(zhǎng)', '粉絲名']
    if not os.path.exists(profile_file):
        # First run: create the workbook and write the header row
        workbook = xlwt.Workbook(encoding='utf-8')
        output_sheet = workbook.add_sheet('profile_sheet', cell_overwrite_ok=True)
        for i in range(len(field_list)):
            output_sheet.write(0, i, field_list[i])
        workbook.save(profile_file)
    # xlrd/xlutils cannot append in place: copy the workbook, write one row, re-save
    rb = xlrd.open_workbook(profile_file)
    rows_num = rb.sheet_by_name('profile_sheet').nrows
    wb = copy(rb)
    output_sheet = wb.get_sheet(0)
    for i in range(len(field_list)):
        if profile_dict.get(field_list[i]):
            output_sheet.write(rows_num, i, profile_dict.get(field_list[i]))
    os.remove(profile_file)
    wb.save(profile_file)

    # Save the images
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    download_limit = 10  # maximum number of images to download per person
    if not os.path.exists('img'):
        os.mkdir('img')
    name_path = project_path + '/img/' + name
    if not os.path.exists(name_path):
        os.mkdir(name_path)
    count = 1
    for img_url in img_list:
        try:
            response = requests.get(img_url, headers=headers)
            content = response.content
            if len(content) < 1000:  # skip images that are too small / low quality
                continue
            filename = name_path + '/' + name + '_%s.jpg' % count
            with open(filename, 'wb') as f:
                f.write(content)
            response.close()
            count += 1
            if count > download_limit:  # stop after download_limit images per person
                break
        except HTTPError as e:  # handle HTTP errors
            print(e.reason)


if __name__ == '__main__':
    name = '潘建偉'  # input('Search term: ')
    intro, profile_dict, img_list = claw(name)
    download(name, intro, profile_dict, img_list)
```
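As a bridge to the next section, the scraper's output can also be serialized to JSON directly. A minimal sketch (the output file name is my assumption, not part of the original script):

```python
import json

intro, profile_dict, img_list = claw('潘建偉')

# Write summary, infobox, and image URLs into one JSON file
with open('profile.json', 'w', encoding='utf-8') as f:
    json.dump({'introduction': intro, 'profile': profile_dict, 'images': img_list},
              f, ensure_ascii=False, indent=4)
```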
2. Outputting the person's profile data in JSON format

2.1 A brief introduction to JSON
JSON is a text-based interchange format. It is human-readable, but the default serialization pads the output with redundant whitespace; the separators parameter of json.dumps() can strip it to compress the data.
2.2 Parameters of json.dumps()
- sort_keys: sorts the dict object's keys in the output; by default a dict is treated as unordered.
A valid JSON document is built from objects (key-value pairs enclosed in braces {}) and arrays (enclosed in brackets []), for example:
```python
dist_city = {
    1: {"city_id": 1, "city_name": "北京", "area": ["城東區(qū)", "城南區(qū)"]},
    2: {"city_id": 2, "city_name": "上海", "area": ["浦東區(qū)", "朝陽區(qū)"]}
}
```

```json
{
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "object",
    "properties": {
        "email": {"type": "string"},
        "firstName": {"type": "string"},
        "lastName": {"type": "string"}
    }
}
```

- ensure_ascii: dumping a dict containing Chinese text produces \uXXXX escape sequences by default; pass ensure_ascii=False to emit the characters as-is.
- skipkeys: when dumping a dict, keys must be of a JSON-compatible type such as str, otherwise a TypeError is raised; with skipkeys=True the offending key-value pairs are silently skipped.
- sort_keys=False suppresses json.dumps()'s automatic key sorting.
- indent=4: pretty-prints the output with line breaks and the given indentation width (see the sketch after this list).
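A short sketch tying these parameters together, reusing the dist_city dict from above (the tuple-keyed dict at the end is my addition, purely to demonstrate skipkeys):

```python
import json

dist_city = {
    2: {"city_id": 2, "city_name": "上海", "area": ["浦東區(qū)", "朝陽區(qū)"]},
    1: {"city_id": 1, "city_name": "北京", "area": ["城東區(qū)", "城南區(qū)"]},
}

# ensure_ascii=False keeps the Chinese text readable instead of \uXXXX escapes;
# sort_keys=True orders the keys; indent=4 pretty-prints the output.
print(json.dumps(dist_city, ensure_ascii=False, sort_keys=True, indent=4))

# separators removes the redundant whitespace for a compact wire format.
print(json.dumps(dist_city, ensure_ascii=False, separators=(',', ':')))

# skipkeys=True silently drops keys that are not str/int/float/bool/None
# (here a tuple key) instead of raising TypeError.
bad = {('lat', 'lng'): (39.9, 116.4), 'city': '北京'}
print(json.dumps(bad, ensure_ascii=False, skipkeys=True))  # {"city": "北京"}
```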
3. Implementing a RESTful Flask interface in Python
Official Flask tutorial and documentation: https://dormousehole.readthedocs.io/en/latest/
Video tutorial on developing a RESTful API with Flask in Python: https://www.bilibili.com/video/BV1Kx411Q7gE?from=search&seid=11300633336131170833
To keep the interface from returning garbled (escaped) Chinese, disable ASCII-only JSON responses:

```python
# Prevent the Flask RESTful interface from returning Chinese as \uXXXX escapes
app.config['JSON_AS_ASCII'] = False
```
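A minimal sketch of how the scraper could be exposed through a Flask route; the module import, endpoint path, and port are my assumptions, and claw() is the scraper function from section 1:

```python
from flask import Flask, jsonify

from spider_baike_text_picture import claw  # scraper from section 1

app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False  # return Chinese characters unescaped

@app.route('/person/<name>', methods=['GET'])
def get_person(name):
    # Scrape on demand and return the result as a JSON document
    intro, profile_dict, img_list = claw(name)
    return jsonify({'name': name, 'introduction': intro,
                    'profile': profile_dict, 'images': img_list})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```

With the server running, `GET /person/潘建偉` would return the summary, infobox, and image URLs as one JSON response.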