當前位置：首頁 > 编程语言 > python > 内容正文

python

【Python】有效资源爬取并集

發布時間：2023/12/3 python 39 豆豆

生活随笔收集整理的這篇文章主要介紹了【Python】有效资源爬取并集，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

由于爬蟲代碼都不多，
所以我決定在這篇博文上更新所有我覺得比較實用的python代碼
方便以后自己調用

環境:python3.7

百度圖片爬蟲

二次元圖片爬取

唐三小說爬取

文件格式命名

百度圖片爬蟲

百度圖片網站

# Baidu image crawler: counts the images found for a keyword, then downloads
# the requested number of them into a new folder.
import os
import re
from urllib import error  # part of the original import set; kept unchanged
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Module-level state shared by the functions below.
num = 0          # number of pictures saved so far
numPicture = 0   # number of pictures the user asked for
file = ''        # directory the pictures are written to
List = []        # one list of image URLs per result page visited by Find()

# Captures the value of every "objURL" field in a result page.
_OBJ_URL_PATTERN = re.compile('"objURL":"(.*?)",', re.S)
# The crawler advances the trailing "pn" query value by this much per request.
_PAGE_STEP = 60


def Find(url):
    """Count the image URLs reachable from the paged search URL *url*.

    Requests ``url + "0"``, ``url + "60"``, ... for every offset below 1000.
    The image URLs of each page are appended to the global ``List``.  Counting
    stops at the first page without any image URL; a page whose request fails
    is skipped.

    Args:
        url: Search URL ending in ``pn=``, to which the offset is appended.

    Returns:
        The total number of image URLs found.
    """
    print('正在檢測圖片總數，請稍等.....')
    total = 0
    for offset in range(0, 1000, _PAGE_STEP):
        try:
            response = requests.get(url + str(offset), timeout=7)
        except requests.RequestException:
            # requests never raises urllib's HTTPError; catch its own base
            # class (and not BaseException, which would swallow Ctrl-C).
            continue
        pic_url = _OBJ_URL_PATTERN.findall(response.text)
        if not pic_url:
            break
        total += len(pic_url)
        List.append(pic_url)
    return total


def recommend(url):
    """Return the link texts found inside ``<div id="topRS">`` at *url*.

    Args:
        url: Address of the page to inspect.

    Returns:
        A list with the text of every ``<a>`` inside that ``div``.  The list
        is empty when the request fails or the ``div`` is absent, so callers
        can always iterate over the result.
    """
    try:
        html = requests.get(url, timeout=7)
    except requests.RequestException:
        return []
    html.encoding = 'utf-8'
    bsObj = BeautifulSoup(html.text, 'html.parser')
    div = bsObj.find('div', id='topRS')
    if div is None:
        return []
    return [anchor.get_text() for anchor in div.find_all('a')]


def dowmloadPicture(html, keyword):
    """Save the images referenced in *html* until ``numPicture`` are stored.

    Each image is written to ``<file>/<keyword>_<num>.jpg`` and the global
    counter ``num`` is incremented after every successful save.  Images that
    cannot be fetched are reported and skipped.

    Args:
        html: Text of one search-result page.
        keyword: Search keyword, used as the file-name prefix.
    """
    global num
    pic_url = _OBJ_URL_PATTERN.findall(html)
    print('找到關鍵詞:' + keyword + '的圖片，即將開始下載圖片...')
    for each in pic_url:
        if num >= numPicture:
            return
        print('正在下載第' + str(num + 1) + '張圖片，圖片地址:' + str(each))
        try:
            pic = requests.get(each, timeout=7)
            # Do not store an HTTP error page under a .jpg name.
            pic.raise_for_status()
        except requests.RequestException:
            print('錯誤，當前圖片無法下載')
            continue
        # os.path.join instead of a hard-coded backslash: works on any OS.
        target = os.path.join(file, keyword + '_' + str(num) + '.jpg')
        with open(target, 'wb') as fp:
            fp.write(pic.content)
        num += 1


def main():
    """Ask for a keyword, a picture count and a folder, then download."""
    global numPicture, file
    word = input("請輸入搜索關鍵詞(可以是人名，地名等): ")
    # Percent-encode the keyword so spaces, '&' and the like cannot break
    # the query string.
    url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
           + quote(word) + '&pn=')
    tot = Find(url)
    Recommend = recommend(url)  # related recommendations, printed at the end
    print('經過檢測%s類圖片共有%d張' % (word, tot))
    numPicture = int(input('請輸入想要下載的圖片數量 '))
    file = input('請建立一個存儲圖片的文件夾，輸入文件夾名稱即可')
    # Keep asking until the name is free; a single retry could still collide
    # and make os.mkdir fail.
    while os.path.exists(file):
        print('該文件已存在，請重新輸入')
        file = input('請建立一個存儲圖片的文件夾，輸入文件夾名稱即可')
    os.mkdir(file)
    for offset in range(0, numPicture, _PAGE_STEP):
        page_url = url + str(offset)
        try:
            result = requests.get(page_url, timeout=10)
        except requests.RequestException:
            print('網絡錯誤，請調整網絡后重試')
            continue
        print(page_url)
        dowmloadPicture(result.text, word)
    print('當前搜索結束，感謝使用')
    print('猜你喜歡')
    # A distinct loop name: the original reused "re" and shadowed the module.
    for suggestion in Recommend:
        print(suggestion, end=' ')


if __name__ == '__main__':  # script entry point
    main()

二次元圖片爬取

二次元圖片網站

# Downloads the large pictures of a paged image gallery, starting from the
# gallery's first-page address.
import re
import urllib.error
import urllib.parse
import urllib.request

# Captures the src attribute of every <img class="pic-large"> tag.
_IMG_PATTERN = re.compile(r'<img class="pic-large" src="(.*?)"')


def open_url(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Address to open.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: if the address cannot be opened.
        ValueError: if *url* is not a usable URL.
    """
    # The context manager closes the connection even when read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def get_img(html):
    """Download every ``pic-large`` image referenced in *html*.

    Each image address is printed and the image is saved in the current
    working directory under the last component of the address path.  An image
    that cannot be downloaded is reported and skipped.

    Args:
        html: Page markup to scan.
    """
    for each in _IMG_PATTERN.findall(html):
        print(each)
        # Name the file after the path only, so a "?query" suffix does not
        # end up in the file name.
        filename = urllib.parse.urlsplit(each).path.rsplit("/", 1)[-1]
        if not filename:
            # Address ends in "/": there is no usable file name.
            continue
        try:
            urllib.request.urlretrieve(each, filename)
        except urllib.error.URLError as exc:
            print(exc)


def main():
    """Prompt for gallery addresses until an empty line is entered."""
    while True:
        word = input(("請輸入所要圖片的首頁網址：")).strip()
        if not word:
            break
        # Strip the ".html" suffix only when it is really there.
        url1 = word[:-5] if word.endswith(".html") else word
        try:
            get_img(open_url(url1 + ".html"))
        except (urllib.error.URLError, ValueError) as exc:
            print(exc)
            continue
        # Follow-up pages are <base>_2.html ... <base>_8.html.
        for num in range(2, 9):
            text = url1 + "_" + str(num) + ".html"
            print(text)
            try:
                get_img(open_url(text))
            except urllib.error.URLError as exc:
                # This page could not be opened: stop paging this gallery.
                print(exc)
                break


if __name__ == '__main__':
    main()

唐三小說爬取

# Downloads every chapter listed on a novel's table-of-contents page and
# appends them to a single text file.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

_TIMEOUT = 10  # seconds allowed for each HTTP request


class book(object):
    """Collects chapter titles and links, then fetches and stores chapters."""

    def __init__(self):
        self.target = "http://www.qiushuge.net/daomubiji2019/"  # contents page
        self.names = []  # chapter titles
        self.urls = []   # chapter addresses, parallel to self.names
        self.nums = 0    # number of chapters

    def getmessage(self):
        """Fill ``names``, ``urls`` and ``nums`` from the contents page.

        Every ``<a>`` found inside a ``<span>`` of the page is taken as one
        chapter: its text is the title, its ``href`` the address.

        Raises:
            requests.RequestException: if the contents page cannot be fetched.
        """
        req = requests.get(url=self.target, timeout=_TIMEOUT)
        req.raise_for_status()
        req.encoding = 'utf-8'  # decode as UTF-8 to avoid garbled text
        bf_content = BeautifulSoup(req.text, "lxml")
        # Start from empty lists so a repeated call does not duplicate chapters.
        self.names = []
        self.urls = []
        for span in bf_content.find_all('span'):
            for link in span.find_all('a'):
                href = link.get('href')
                if not href:
                    # An anchor without an address cannot be downloaded.
                    continue
                self.names.append(link.text)
                # Resolve relative links against the contents page.
                self.urls.append(urljoin(self.target, href))
        self.nums = len(self.names)

    def gettext(self, target):
        """Return the text of all ``<p>`` elements of the page at *target*.

        Args:
            target: Address of one chapter page.

        Returns:
            Each paragraph prefixed by a space and followed by a blank line,
            concatenated into one string.

        Raises:
            requests.RequestException: if the page cannot be fetched.
        """
        req = requests.get(url=target, timeout=_TIMEOUT)
        req.raise_for_status()
        req.encoding = 'utf-8'
        bf_content = BeautifulSoup(req.text, "lxml")
        return ''.join(
            ' ' + paragraph.text + '\n\n'
            for paragraph in bf_content.find_all('p')
        )

    def write(self, name, path, text):
        """Append chapter title *name* and body *text* to the file *path*.

        Args:
            name: Chapter title, written on its own line.
            path: File to append to (UTF-8).
            text: Chapter body, followed by a blank line.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text + '\n\n')


if __name__ == "__main__":
    d1 = book()
    d1.getmessage()
    print('正在下載《盜墓筆記》...')
    for chapter_name, chapter_url in zip(d1.names, d1.urls):
        # "%" formatting: the original passed the name as a second print
        # argument, so a literal "%s" was printed.
        print('正在下載:%s' % chapter_name)
        try:
            chapter_text = d1.gettext(chapter_url)
        except requests.RequestException as exc:
            # Report the failed chapter and carry on with the next one.
            print(exc)
            continue
        d1.write(chapter_name, '盜墓筆記.txt', chapter_text)

文件格式命名

# Batch-renames every regular file of a directory to <prefix><number><suffix>.
import os
import sys
import uuid


def rename(path=None, name=None, startNumber=None, fileType=None):
    """Rename the files directly inside *path* to ``name + number + fileType``.

    Sub-directories are left untouched.  Files are processed in sorted name
    order and numbered consecutively starting at *startNumber*.  Any argument
    left as ``None`` is asked for interactively, so calling ``rename()`` with
    no arguments prompts for all four values.

    Args:
        path: Directory whose files are renamed.
        name: Prefix of the new file names.
        startNumber: First number to use (``int`` or numeric ``str``).
        fileType: Suffix of the new names, e.g. ``".jpg"``.

    Raises:
        ValueError: if *startNumber* is not an integer.
        OSError: if the directory cannot be read or a rename fails.
    """
    if path is None:
        path = input("請輸入路徑(例如D:\\\\picture)：")
    if name is None:
        name = input("請輸入開頭名:")
    if startNumber is None:
        startNumber = input("請輸入開始數:")
    if fileType is None:
        fileType = input("請輸入后綴名（如 .jpg、.txt等等）:")
    # Convert once, before any file is touched, so bad input cannot leave the
    # directory half renamed.
    first = int(startNumber)
    print("正在生成以" + name + str(startNumber) + fileType + "迭代的文件名")

    # Sorted, because os.listdir returns entries in arbitrary order.
    filelist = sorted(
        entry for entry in os.listdir(path)
        if not os.path.isdir(os.path.join(path, entry))
    )

    # Phase 1: move every file to a unique temporary name.  Renaming straight
    # to the final name could hit a file that is still waiting its turn
    # (overwritten on POSIX, FileExistsError on Windows).
    tag = ".rename-" + uuid.uuid4().hex + "-"
    parked = []
    for index, entry in enumerate(filelist):
        temporary = os.path.join(path, tag + str(index))
        os.rename(os.path.join(path, entry), temporary)
        parked.append(temporary)

    # Phase 2: give the parked files their final names.
    for index, temporary in enumerate(parked):
        final = os.path.join(path, name + str(first + index) + fileType)
        os.rename(temporary, final)

    print("一共修改了" + str(len(filelist)) + "個文件")


if __name__ == "__main__":
    rename()

總結

以上是生活随笔為你收集整理的【Python】有效资源爬取并集的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： WordPress的7个最佳在线表单生成
下一篇：【Python】Scrapy的安装与使用

日韩av黄I国产麻豆传媒I国产91av视频在线观看I日韩一区二区三区在线看I美女国产在线I麻豆视频国产在线观看I成人黄色短片

python

【Python】有效资源爬取并集

百度圖片爬蟲

二次元圖片爬取

唐三小說爬取

文件格式命名

總結