Python code to download Jandan girl pics [for personal use]
[Personal use] Holding high the banner of socialist spiritual civilization: spreading this is not recommended!
Parsing the HTML didn't take much effort. The real problem was the download failure rate: even after imitating a browser there were still plenty of failures, which turned out to be caused by a badly formed Referer header. After fixing it, ten pages downloaded without a single failure.
Of course, working out the download side also took some time.
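The fix boils down to deriving the Referer from the page URL with its `#comments` fragment stripped off. A minimal sketch of just that idea (the full script below folds it into a complete fake-browser header dict):

```python
# Minimal sketch: the Referer is the page URL minus the '#comments' fragment.
def make_referer(page_url):
    # 'http://jandan.net/ooxx/page-2293#comments' -> 'http://jandan.net/ooxx/page-2293'
    return page_url.split('#', 1)[0]
```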
One last time: personal use! I stress it: personal use! For the sake of physical health and spiritual civilization!
This code is copyrighted! Copying, distributing, or even viewing it is forbidden for 250 milliseconds after publication!! After 250 milliseconds the copyright expires. Hereby declared!
2017-03-02: Found why downloads sometimes still failed. After the first request, the server returns an ETag; you have to read it and send it back to the server on later requests to continue, otherwise you get a "file cannot be read" error.
Looks like it's time to study the HTTP protocol. That thing really isn't as simple as it looks!!
So I built a dictionary of servers: every request records both the request headers and the response headers, and whenever an ETag shows up it gets sent back to the server (as If-None-Match). The success rate went up a good deal again.
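Distilled to its core, the mechanism is: remember the ETag each host sends and return it as If-None-Match on the next request to that host. A minimal standalone sketch with a hypothetical `etags` cache keyed by host (the full script below caches a whole header dict per host instead):

```python
from urllib import request
from urllib.parse import urlparse

etags = {}  # hypothetical cache: host -> last ETag seen from that host

def fetch(url):
    host = urlparse(url).netloc
    req = request.Request(url)
    if host in etags:
        # echo the server's ETag back as If-None-Match
        req.add_header('If-None-Match', etags[host])
    with request.urlopen(req) as resp:  # note: a 304 reply raises HTTPError
        etag = resp.headers.get('ETag')
        if etag:
            etags[host] = etag
        return resp.read()
```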
The code was ported to a Raspberry Pi with no changes at all (other than the image save path)!
Note that you need to install the Raspberry Pi build of lxml, including two related components (libxml2-dev and libxslt1-dev); see below.
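On Raspbian that amounts to something like the following (assuming apt-get and pip3 are available; the exact commands are my reconstruction, not from the original post):

```sh
sudo apt-get install libxml2-dev libxslt1-dev
pip3 install lxml
```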
Here is the full script:

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Python 3.6 64-bit / Win10 64-bit
# Next goal: keep a download history so already-downloaded images are skipped.
# Let's use sqlite3 for that!
import urllib
import urllib.request as request
import io
import gzip
from lxml import etree  # needs the lxml package; install it yourself with pip
import time
import datetime
import random
import traceback
import sys
from urllib.request import FancyURLopener


def make_header(call_url):
    """Build a faked browser header to raise the success rate of page requests."""
    referer = call_url
    if call_url.find('#') > 0:
        referer = call_url[:call_url.find('#')]  # drop the '#comments' fragment
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip,deflate",
        "Referer": referer,
        "Connection": "keep-alive",
        "If-Modified-Since": "Mon, 08 Jul 2013 18:06:40 GMT",
        "Cache-Control": "max-age=0"
    }


def insert_header(header, img_url, call_url):
    """Build a faked browser header to raise the download success rate."""
    referer = call_url
    if call_url.find('#') > 0:
        referer = call_url[:call_url.find('#')]
    host = img_url[7:21]  # e.g. 'ww2.sinaimg.cn': 14 chars after 'http://'
    header.addheader("Host", host)
    header.addheader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
    if img_url.find('.gif') > 0:
        header.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    else:
        header.addheader("Accept", "*/*")
    header.addheader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
    header.addheader("Accept-Encoding", "gzip,deflate")
    header.addheader("Referer", referer)
    header.addheader("Connection", "keep-alive")
    header.addheader("Upgrade-Insecure-Requests", "1")
    header.addheader("If-Modified-Since", "Sun, 26 Feb 2017 03:53:17 GMT")
    header.addheader('If-None-Match', '"F57B886E1C77028F85FAA6F665CD559E"')
    header.addheader("Cache-Control", "max-age=0")
    # print the request headers, for testing
    for i in header.addheaders:
        print(i.__str__())
    return header


# a few shared parameters
enter_url = 'http://jandan.net/ooxx'
enter_url1 = 'http://jandan.net/ooxx/page-2293#comments'
save_path = 'e:\\xpic\\'
download_fail_info = '... download fail'
# maximum number of visits: how many pages to read going backwards
max_times = 30
curr_times = 0


class header_key:  # note: unused below
    head_name = ''
    header_value = ''


class server_info:
    """Per-host record of request and response headers."""

    def __init__(self):
        # instance-level dicts: class-level ones would be shared by all hosts
        self.server_name = ''
        self.server_request_header = {}
        self.server_response_header = {}


server_infos = {}

first_request_header = {
    'Host': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip,deflate',
    'Referer': '',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'If-Modified-Since': 'Sun, 26 Feb 2017 03:53:17 GMT',
    'Cache-Control': 'max-age=0'
}

# first_request_header.fromkeys('Etag', '"2222"')
#
# first_request_header['Etag'] = '"333"'
#
# for i in first_request_header:
#     print(i, '==', first_request_header[i])

# a standalone download test
img_u = 'http://ww2.sinaimg.cn/mw600/95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
file_n = 'e:\\xpic\\95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'


def get_host(url_str):
    return url_str[7:url_str.find('/', 7)]


def get_referer_url(url_str):
    referer = url_str
    if url_str.find('#') > 0:
        referer = url_str[:url_str.find('#')]
    return referer


def download_filex(html_url, img_url, file_name):
    """Download with urllib.request, caching request/response headers per host."""
    global server_infos
    curr_host = get_host(img_url)
    referer_url = get_referer_url(html_url)
    s = server_info()
    if curr_host not in server_infos:
        s.server_name = curr_host
        for k in first_request_header:
            v = first_request_header[k]
            s.server_request_header[k] = v
        s.server_request_header['Host'] = curr_host
        s.server_request_header['Referer'] = referer_url
        server_infos[curr_host] = s
        print(len(server_infos))
        # print(s.server_request_header.__str__())
        # print_server_list()
        print('add ---- ', s.server_name, ' == ', s.server_request_header['Host'])
        # print('add ---- ', s)
    else:
        s = server_infos[curr_host]
        s.server_request_header['Referer'] = referer_url
        print('load ---- ', curr_host)
        print('load ---- ', s.server_name, ' == ', s.server_request_header['Host'])
    req = request.Request(img_url)
    for k in s.server_request_header:
        v = s.server_request_header[k]
        req.add_header(k, v)
        # print('request = [%s , %s]' % (k, v))
    with request.urlopen(req) as f:
        print('Status:', f.status, f.reason)
        for k, v in f.getheaders():
            s.server_response_header[k] = v
            if k == 'Etag':
                # echo the ETag back on the next request to this host
                s.server_request_header['If-None-Match'] = v
            # print('response = [%s: %s]' % (k, v))
        with open(file_name, 'wb+') as save_file:
            save_file.write(f.read())


# download_filex(img_u, file_n)


def print_server_list():
    global server_infos
    print('------------ server info ------------')
    for k in server_infos:
        v = server_infos[k]
        print('server = [%s]' % (k))
        for xk in v.server_request_header:
            xv = v.server_request_header[xk]
            print('server_request = [%s : %s]' % (xk, xv))


def get_html(url):
    """Fetch the HTML content at url; knows how to decompress gzipped HTML."""
    print('get ...... ', url)
    curr_html_str = ''
    try:
        req = urllib.request.Request(url, headers=make_header(url))
        bs = urllib.request.urlopen(req).read()
        bi = io.BytesIO(bs)
        gf = gzip.GzipFile(fileobj=bi, mode="rb")
        curr_html_str = gf.read().decode("utf8")
    except Exception as ex:
        print(url, ' ...... ', ex.__str__())
    finally:
        pass
    return curr_html_str


def make_rando():
    """Generate a random number of seconds."""
    sleep_time = random.randint(120, 180)
    return sleep_time


def sleep():
    """Sleep for the randomly generated number of seconds."""
    sleep_time_temp = make_rando()
    now = datetime.datetime.now()
    now_time = now.strftime('%Y-%m-%d %H:%M:%S.%f')
    print(now_time, ' ... sleep ... ', sleep_time_temp)
    print_server_list()
    time.sleep(sleep_time_temp)


def get_img_url(call_url, html):
    """Collect the URLs of static images (jpg and the like) and download them."""
    result = html.xpath('//img/@src')
    for i in result:
        img_url = i
        if img_url[6:13] == 'sinaimg':
            # skip gif cover thumbnails; they don't animate
            if img_url.find('thumb') >= 0:
                continue
            img_url = 'http:' + img_url
            file_name = save_path + img_url[img_url.rfind('/') + 1:]
            print(img_url, ' ----> ', file_name)
            try:
                # urllib.request.urlretrieve(img_url, new_name)
                # download_file(call_url, img_url, file_name)
                download_filex(call_url, img_url, file_name)
            except Exception as ex:
                print(file_name, download_fail_info, ex.__str__())
            finally:
                pass


def get_gif_url(call_url, html):
    """Collect the addresses of animated gifs and download them."""
    result = html.xpath('//img/@org_src')
    for i in result:
        img_url = i
        if img_url[6:13] == 'sinaimg':
            img_url = 'http:' + img_url
            file_name = save_path + img_url[img_url.rfind('/') + 1:]
            print(img_url, ' ----> ', file_name)
            try:
                # download_file(call_url, img_url, file_name)
                download_filex(call_url, img_url, file_name)
            except Exception as ex:
                ti = traceback.format_exc()
                print(file_name, download_fail_info, ex.__str__(), ti)
            finally:
                pass


def download_file(call_url, img_url, file_name):
    """Download a file imitating Firefox; the success rate is already very high."""
    opener = FancyURLopener()
    opener.addheaders.clear()
    opener = insert_header(opener, img_url, call_url)
    file_data = opener.open(img_url)
    try:
        with open(file_name, 'wb+') as save_file:
            save_file.write(file_data.read())
    except Exception as ex:
        print(file_name, ' ...... write fail ', ex.__str__())
    finally:
        file_data.close()


def get_next_page(current_url):
    """Entry function: given a starting page, keep visiting the next page."""
    global curr_times
    current_html_str = get_html(current_url)
    if current_html_str is None or current_html_str == '':
        return
    current_html = etree.HTML(current_html_str)
    get_img_url(current_url, current_html)
    get_gif_url(current_url, current_html)
    page_result = current_html.xpath('//a[@title="Older Comments"]/@href')
    curr_times += 1
    # stop when there is no next page or the maximum count is reached
    if len(page_result) <= 0 or curr_times >= max_times:
        return
    next_page_url = page_result[0]
    # random sleep (pretend I'm reading this page)
    sleep()
    get_next_page(next_page_url)


# run it
get_next_page(enter_url)

# a standalone download test
# img_u = 'http://ww2.sinaimg.cn/mw600/95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
# file_n = 'e:\\xpic\\95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
# download_file(img_u, file_n)
```
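The header comment names the next goal: a sqlite3 download history so images that were already fetched get skipped. A minimal sketch of what that could look like (the `history.db` file name and the `downloaded` table are hypothetical, not part of the original script):

```python
import sqlite3

conn = sqlite3.connect('history.db')  # hypothetical history database
conn.execute('CREATE TABLE IF NOT EXISTS downloaded (file_name TEXT PRIMARY KEY)')

def already_downloaded(file_name):
    """True if this file name was recorded by an earlier run."""
    row = conn.execute('SELECT 1 FROM downloaded WHERE file_name = ?',
                       (file_name,)).fetchone()
    return row is not None

def mark_downloaded(file_name):
    """Record a successful download."""
    conn.execute('INSERT OR IGNORE INTO downloaded (file_name) VALUES (?)',
                 (file_name,))
    conn.commit()
```

Checking `already_downloaded(file_name)` before calling `download_filex`, and calling `mark_downloaded(file_name)` after a successful write, would be enough to wire it in.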
Reposted from: https://my.oschina.net/raddleoj/blog/847367