當前位置：首頁 > 编程资源 > 综合教程 > 内容正文

综合教程

Python-爬取妹子图(单线程和多线程版本)

發布時間：2023/12/13 综合教程 31 生活家

生活随笔收集整理的這篇文章主要介紹了 Python-爬取妹子图(单线程和多线程版本)，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

一、參考文章

Python爬蟲之——爬取妹子圖片

上述文章中的代碼講述得非常清楚，我的基本思路也是這樣。本篇文章中的代碼僅僅做了一些異常處理和一些日志顯示優化工作，寫此文章主要是當做筆記，方便以后查閱，修改的地方如下：

1、異常處理：下面在代碼中會單獨標紅

2、多線程版使用了multiprocessing這個庫，需要在main函數開始調用freeze_support()，防止打包成exe之后，運行時創建進程失敗

3、多線程版本加了一個命令行自定義線程個數功能

二、單線程版本

 1 #coding=utf-8
 2 import requests
 3 from bs4 import BeautifulSoup
 4 import os
 5 
 6 all_url = 'http://www.mzitu.com'
 7 
 8 
 9 #http請求頭
10 Hostreferer = {
11     'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
12     'Referer':'http://www.mzitu.com'
13                }
14 Picreferer = {
15     'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
16     'Referer':'http://i.meizitu.net'
17 }
18 #此請求頭破解盜鏈
19 
20 start_html = requests.get(all_url, headers = Hostreferer)
21 
22 #保存地址
23 path = os.getcwd() + '/mzitu/'
24 
25 #找尋最大頁數(shù)
26 soup = BeautifulSoup(start_html.text, "html.parser")
27 page = soup.find_all('a', class_='page-numbers')
28 max_page = page[-2].text
29 
30 
31 same_url = 'http://www.mzitu.com/page/'
32 for n in range(0, int(max_page)+1):#遍歷頁面數(shù)
33     ul = same_url+str(n)
34     start_html = requests.get(ul, headers = Hostreferer)
35     soup = BeautifulSoup(start_html.text, "html.parser")
36     all_a = soup.find('div', class_ = 'postlist').find_all('a', target = '_blank')
37     for a in all_a:#每個(gè)頁面包含的妹子數(shù)
38         title = a.get_text() #提取文本
39         if(title != ''):
40             print("準(zhǔn)備扒取：" + title)
41 
42             #win不能創(chuàng)建帶？的目錄
43             if(os.path.exists(path+title.strip().replace('?', ''))):
44                     #print('目錄已存在')
45                     flag = 1
46             else:
47                 os.makedirs(path+title.strip().replace('?', ''))
48                 flag = 0
49             os.chdir(path + title.strip().replace('?', ''))
50             href = a['href']
51             html = requests.get(href, headers = Hostreferer)
52             mess = BeautifulSoup(html.text, "html.parser")
53             pic_max = mess.find_all('span')
54             pic_max = pic_max[10].text #最大頁數(shù)
55             if(flag == 1 and len(os.listdir(path+title.strip().replace('?', ''))) >= int(pic_max)):
56                 print('已經(jīng)保存完畢，跳過')
57                 continue
58             for num in range(1, int(pic_max) + 1):#每個(gè)妹子的所有照片
59                 pic = href+'/'+str(num)
60                 html = requests.get(pic, headers = Hostreferer)
61                 mess = BeautifulSoup(html.text, "html.parser")
62                 pic_url = mess.find('img', alt = title)
63                
64                 if 'src' not in pic_url.attrs:#有些pic_url標(biāo)簽沒有src這個(gè)屬性，導(dǎo)致操作異常，在次進(jìn)行過濾
65                     continue
66                 print(pic_url['src'])
67                 #exit(0)
68                 html = requests.get(pic_url['src'],headers = Picreferer)
69                 file_name = pic_url['src'].split(r'/')[-1]
70                 f = open(file_name, 'wb')
71                 f.write(html.content)
72                 f.close()
73             print('完成')
74     print('第',n,'頁完成')

三、多線程版本

 1 #coding=utf-8
 2 import requests
 3 from bs4 import BeautifulSoup
 4 import os
 5 from multiprocessing import Pool
 6 from multiprocessing import freeze_support
 7 import sys
 8 
 9 header = {
10     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
11     'Referer':'http://www.mzitu.com'
12     }
13 Picreferer = {
14     'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
15     'Referer':'http://i.meizitu.net'
16 }
17 
def find_MaxPage(timeout=10):
    """Return the number of listing pages on the site, as text.

    Fetches the front page and returns the text of the second-to-last
    ``<a class="page-numbers">`` link.

    Args:
        timeout: Seconds to wait for the HTTP response. Without a timeout
            the request could block indefinitely.

    Returns:
        The link text (a string); callers convert it with ``int()``.

    Raises:
        IndexError: If the front page has fewer than two pagination links.
    """
    all_url = 'http://www.mzitu.com'
    start_html = requests.get(all_url, headers=header, timeout=timeout)
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    if len(page) < 2:
        # Same exception type as the bare page[-2] lookup, but with an
        # explanation of what went wrong.
        raise IndexError(
            'expected at least two "page-numbers" links on ' + all_url
            + ', found ' + str(len(page))
        )
    # NOTE(review): the last link is presumably the "next page" arrow, which
    # is why the second-to-last one is used; confirm against the live markup.
    return page[-2].text
26 
def Download(href, title, path, timeout=10):
    """Download every image of one gallery into its own folder under *path*.

    Args:
        href: URL of the gallery; page ``n`` of the gallery is fetched from
            ``href + '/' + str(n)``.
        title: Gallery title. It names the target folder (stripped, with
            '?' removed) and is matched against the ``alt`` attribute to
            locate the ``<img>`` on each gallery page.
        path: Root directory prefix; the folder name is appended to it
            directly, so it should end with a path separator.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        1 if the gallery folder already contains at least as many files as
        the gallery has pages (nothing is downloaded); otherwise None.
    """
    html = requests.get(href, headers=header, timeout=timeout)
    soup = BeautifulSoup(html.text, 'html.parser')
    # The gallery's page count is read from the 11th <span> on the page;
    # this index depends on the site's current layout.
    pic_max = soup.find_all('span')[10].text

    folder = path + title.strip().replace('?', '')
    if os.path.exists(folder) and len(os.listdir(folder)) >= int(pic_max):
        print('妹子已待命，繼續(xù)準(zhǔn)備下一個(gè)妹子' + title)
        return 1
    print(f"發(fā)現(xiàn)妹子資源{pic_max}個(gè)，準(zhǔn)備中：" + title)

    # exist_ok=True lets a partially downloaded gallery be resumed; a plain
    # makedirs() raised FileExistsError here, and that error was silently
    # dropped by the worker pool.
    os.makedirs(folder, exist_ok=True)

    for num in range(1, int(pic_max) + 1):
        pic = href + '/' + str(num)
        html = requests.get(pic, headers=header, timeout=timeout)
        mess = BeautifulSoup(html.text, "html.parser")
        pic_url = mess.find('img', alt=title)
        # Skip pages where the <img> is missing entirely or lacks 'src'.
        if pic_url is None or 'src' not in pic_url.attrs:
            continue
        print(f"{title}：{pic_url['src']}")
        html = requests.get(pic_url['src'], headers=header, timeout=timeout)
        file_name = pic_url['src'].split('/')[-1]
        # Write by explicit path rather than changing the worker's working
        # directory, and let the context manager close the file on error.
        with open(os.path.join(folder, file_name), 'wb') as f:
            f.write(html.content)
    print('妹子已就緒，客官請慢用：' + title)
53 
if __name__ == '__main__':
    # Must be called first so that a frozen (packaged .exe) build can
    # create its worker processes.
    freeze_support()

    # Number of pool workers (multiprocessing.Pool uses processes, not
    # threads); may be overridden by the first command-line argument.
    count = 1
    if len(sys.argv) >= 2:
        count = int(sys.argv[1])

    pool = Pool(count)
    # The original f-string had a stray '$' that was printed literally
    # in front of the count.
    print(f'初始化下載線程個(gè)數(shù){count}')

    def report_failure(exc):
        """Print an exception raised inside a Download worker."""
        # Without an error_callback, apply_async discards worker exceptions
        # unless .get() is called on the result, so failures went unnoticed.
        print(f'Download task failed: {exc!r}')

    # Root directory under which one folder per gallery is created.
    path = os.getcwd() + '/mzitu_mutil/'
    max_page = find_MaxPage()  # number of listing pages to walk
    print(f'捕獲{max_page}頁妹子，請耐心等待下載完成')
    same_url = 'http://www.mzitu.com/page/'

    for n in range(1, int(max_page) + 1):
        each_url = same_url + str(n)
        # Fetch one listing page; it links to several galleries.
        start_html = requests.get(each_url, headers=header, timeout=10)
        soup = BeautifulSoup(start_html.text, "html.parser")
        postlist = soup.find('div', class_='postlist')
        if postlist is None:
            # One malformed or blocked listing page should not abort the
            # whole crawl; report it and move on.
            print(f'No gallery list found on {each_url}, skipping')
            continue
        for a in postlist.find_all('a', target='_blank'):
            title = a.get_text()
            if title != '':
                href = a['href']  # URL of this gallery
                pool.apply_async(Download, args=(href, title, path),
                                 error_callback=report_failure)
    # No more tasks will be submitted; wait for queued downloads to finish.
    pool.close()
    pool.join()
    print('所有妹子已就緒，客官請慢用')

四、資源下載

　　資源下載地址：Python爬取妹子圖-單線程和多線程版本

轉載聲明：本站文章無特別說明，皆為原創，版權所有，轉載請注明：朝十晚八

總結

以上是生活随笔為你收集整理的Python-爬取妹子图(单线程和多线程版本)的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： Android Glide图片加载框架（
下一篇：基于蓝墨云班课的翻转课堂实践