爬取图片,并按比例划分数据集
生活随笔
收集整理的這篇文章主要介紹了
爬取图片,并按比例划分数据集
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
爬取圖片,并按比例劃分數據集
上課老師布置的作業,做完保存下來。
(1)批量爬取不同的10個(gè)明星的圖片各100張;
(2)每個(gè)明星的圖片放一個(gè)文件夾,并按“name_###.jpg”的格式命名,如:liudehua/liudehua_000.jpg;
(3)對每個明星的圖片,按7:2:1的比例進行劃分train、validation、test數據集,分別將圖片名稱保存入train.txt、validation.txt、test.txt;
代碼實現
# -*- coding: utf-8 -*-
"""Crawl celebrity photos from Baidu image search, then split each
celebrity's folder into train/validation/test sets at a 7:2:1 ratio,
appending the image file names to train.txt / validation.txt / test.txt.
"""
import sys
import os
import re
import uuid
import requests
import random

# Request headers captured from the Edge browser when visiting
# image.baidu.com, so the site treats the crawler as a normal browser user.
HEADERS = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept - Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'Host': 'image.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'
}


def download_image(key_word, maximum_download, maximum_failure, headers=HEADERS):
    """Search Baidu images for *key_word* and download pictures into
    ./face_image/<key_word>/ named '<key_word>_###.jpg'.

    Args:
        key_word: search keyword; also used as the folder name and file prefix.
        maximum_download: stop after this many images are saved.
        maximum_failure: give up after this many failed page/image fetches.
        headers: HTTP headers sent with the search-page requests.
    """
    download_sum = 0     # images successfully written to disk
    download_index = 0   # running index; also drives the 'pn' paging offset
    failure_sum = 0
    str_gsm = '00'
    # Each celebrity's images go into their own folder.
    save_path = './face_image' + '/' + key_word
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    while download_sum < maximum_download and failure_sum < maximum_failure:
        # Baidu "flip" search endpoint; 'pn' is the result offset.
        url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
               '&word=%s&pn=%s&gsm=%s&ct=&ic=0&lm=-1&width=0&height=0'
               % (key_word, str(download_index), str_gsm))
        print("page url: %s" % (url))
        try:
            # Fetch the search result page source (10 s timeout).
            result = requests.get(url, timeout=10, headers=headers).text
            # Extract the original image URLs embedded in the page; re.S lets
            # the pattern match across line breaks in the page source.
            img_urls = re.findall('"objURL":"(.*?)",', result, re.S)
            if len(img_urls) < 1:
                # BUGFIX: the original had an unreachable `break` right after
                # this raise; raising alone is enough — the outer except
                # counts it as a failure and the while loop decides whether
                # to retry.
                raise ValueError('無法搜索到圖片,或URL無效')
            for img_url in img_urls:
                # BUGFIX: isolate each image download so one bad URL no
                # longer aborts the remaining results on the page.
                try:
                    img = requests.get(img_url, timeout=30)
                    img_name = '%s/%s_%s.jpg' % (
                        save_path, key_word, str(download_index).zfill(3))
                    with open(img_name, 'wb') as f:
                        f.write(img.content)
                    download_sum += 1
                except Exception as e:
                    print('【錯誤】當前圖片無法下載,%s' % e)
                    failure_sum += 1
                finally:
                    # Advance the index whether or not the download worked,
                    # so file names stay unique and paging moves forward.
                    download_index += 1
                if download_sum >= maximum_download:
                    break
        except Exception as e:
            # Page-level failure (network error, no URLs found, ...).
            print('【錯誤】當前圖片無法下載,%s' % e)
            failure_sum += 1
            download_index += 1
            continue
    print('下載完成')


def _split_names(names, train_ratio=0.7, validation_ratio=0.2):
    """Randomly split *names* into (train, validation, test) lists.

    Generalizes the original hard-coded sample sizes (70/20, which assumed
    exactly 100 files and raised ValueError otherwise) to a proportional
    7:2:1 split over however many files actually exist.
    """
    pool = list(names)
    n_train = int(len(pool) * train_ratio)
    n_validation = int(len(pool) * validation_ratio)
    train = random.sample(pool, n_train)
    train_set = set(train)  # O(1) membership instead of list scans
    rest = [name for name in pool if name not in train_set]
    validation = random.sample(rest, n_validation)
    validation_set = set(validation)
    test = [name for name in rest if name not in validation_set]
    return train, validation, test


def _append_lines(txt_path, names):
    """Append one file name per line to *txt_path* (UTF-8)."""
    with open(txt_path, 'a+', encoding='utf-8') as f:
        for name in names:
            f.write(name + '\n')


def main():
    # Per-celebrity download cap and the allowed number of failures.
    max_download = 100
    max_failure = 10
    # Celebrities to search for (one folder each under ./face_image/).
    key_word = ['mayun', 'wangfei', 'liuxiang', 'tongliya', 'luhan',
                'huangbo', 'zhaobenshan', 'songxiaobao', 'liudehua',
                'zhoujielun']
    for name in key_word:
        download_image(name, max_download, max_failure)
    print('全部圖片已下載完成')
    # Split every celebrity's folder 7:2:1 and record the file names.
    for name in key_word:
        file_path = './face_image' + '/' + name
        train, validation, test = _split_names(os.listdir(file_path))
        _append_lines('train.txt', train)
        _append_lines('validation.txt', validation)
        _append_lines('test.txt', test)
        print(' 保存完成!')


if __name__ == '__main__':
    main()
以上是生活随笔為你收集整理的爬取图片,并按比例划分数据集的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: PHP GD库 生成图片水印
- 下一篇: WSDM-爱奇艺:用户留存预测挑战赛 线