Python Web Crawling: Using an IP Proxy Pool with Scrapy
Automatically updating the IP pool
Write a class in proxies.py that fetches proxies automatically; running it saves the collected IPs to a txt file:
Code
# *-* coding:utf-8 *-*
import random
import time
from multiprocessing import Process, Queue

import requests
from bs4 import BeautifulSoup


class Proxies(object):
    """Collect free proxies from xicidaili.com and verify them."""

    def __init__(self, page=3):
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        self.get_proxies()
        self.get_proxies_nn()

    def get_proxies(self):
        # Scrape the "nt" (transparent proxy) listing pages
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nt/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join(
                    [x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def get_proxies_nn(self):
        # Scrape the "nn" (high-anonymity proxy) listing pages
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nn/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join(
                    [x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def verify_proxies(self):
        # Proxies that have not been verified yet
        old_queue = Queue()
        # Proxies that passed verification
        new_queue = Queue()
        print('verify proxy ......')
        works = []
        for _ in range(15):
            works.append(Process(target=self.verify_one_proxy,
                                 args=(old_queue, new_queue)))
        for work in works:
            work.start()
        for proxy in self.proxies:
            old_queue.put(proxy)
        for work in works:
            old_queue.put(0)
        for work in works:
            work.join()
        self.proxies = []
        while 1:
            try:
                self.proxies.append(new_queue.get(timeout=1))
            except:
                break
        print('verify_proxies done!')

    def verify_one_proxy(self, old_queue, new_queue):
        while 1:
            proxy = old_queue.get()
            if proxy == 0:
                break
            protocol = 'https' if 'https' in proxy else 'http'
            proxies = {protocol: proxy}
            try:
                if requests.get('http://www.baidu.com', proxies=proxies,
                                timeout=2).status_code == 200:
                    print('success %s' % proxy)
                    new_queue.put(proxy)
            except:
                print('fail %s' % proxy)


if __name__ == '__main__':
    a = Proxies()
    a.verify_proxies()
    print(a.proxies)
    proxies = a.proxies
    with open('proxies.txt', 'a') as f:
        for proxy in proxies:
            f.write(proxy + '\n')

Run it:
python proxies.py
The proxies that pass verification are written to proxies.txt.
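Each line of proxies.txt holds one full proxy URL of the form scheme://host:port, as assembled by the Proxies class above. Here is a minimal sketch (not part of the original article) for sanity-checking the saved file; the addresses shown in the comment are made-up placeholders:

# check_proxies.py -- quick sanity check of the saved proxy list
# Expected line format, for example:
#   http://123.45.67.89:8080
#   https://98.76.54.32:3128   (placeholder addresses, not real proxies)

with open('proxies.txt', 'r') as f:
    proxies = [line.strip() for line in f if line.strip()]

print('%d proxies saved' % len(proxies))
for proxy in proxies[:5]:
    print(proxy)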
Modify the middleware file middlewares.py as follows:
import random
import time


class ProxyMiddleWare(object):
    """Downloader middleware that attaches a random proxy to each request."""

    def process_request(self, request, spider):
        '''Add a proxy to the request object.'''
        proxy = self.get_random_proxy()
        print("this is request ip:" + proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        '''Handle the returned response.'''
        # If the response status is not 200, retry the current request
        # with a different proxy.
        if response.status != 200:
            proxy = self.get_random_proxy()
            print("this is response ip:" + proxy)
            # Attach a new proxy to the current request
            request.meta['proxy'] = proxy
            return request
        return response

    def get_random_proxy(self):
        '''Read a random proxy from the file.'''
        while 1:
            # Replace the path below with wherever you saved proxies.txt
            with open('path/where/you/saved/proxies.txt', 'r') as f:
                proxies = f.readlines()
            if proxies:
                break
            else:
                time.sleep(1)
        proxy = random.choice(proxies).strip()
        return proxy
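Re-reading proxies.txt on every request keeps the code simple but adds disk I/O to each download. A common variation is to load the list once when the middleware is created. A minimal sketch, not from the original article, assuming the same proxies.txt layout (PROXY_FILE is a hypothetical setting name):

import random


class CachedProxyMiddleWare(object):
    """Variation of ProxyMiddleWare that loads the proxy list only once."""

    def __init__(self, proxy_file='proxies.txt'):
        # Read the verified proxies a single time at startup.
        with open(proxy_file, 'r') as f:
            self.proxies = [line.strip() for line in f if line.strip()]

    @classmethod
    def from_crawler(cls, crawler):
        # PROXY_FILE is a hypothetical setting; fall back to proxies.txt.
        return cls(crawler.settings.get('PROXY_FILE', 'proxies.txt'))

    def process_request(self, request, spider):
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)

The trade-off is that new proxies written to the file are not picked up until the crawler restarts.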
Then update the settings file:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'myproxies.middlewares.ProxyMiddleWare': 125,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,
}

Here myproxies is the project name, middlewares is the name of the .py file, and ProxyMiddleWare is the class name.
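To confirm the middleware is actually applied, any spider in the project will do. Below is a minimal sketch with a hypothetical spider (the name ip_check and the use of httpbin.org are illustration choices, not from the original article); httpbin.org/ip echoes back the IP the request arrives from, so the crawl log shows whether the proxy took effect:

# myproxies/spiders/ip_check.py  (hypothetical file and spider, for illustration)
import scrapy


class IpCheckSpider(scrapy.Spider):
    name = 'ip_check'
    start_urls = ['http://httpbin.org/ip']

    def parse(self, response):
        # httpbin returns JSON like {"origin": "1.2.3.4"}
        self.logger.info('origin reported by httpbin: %s', response.text)

Run it with scrapy crawl ip_check and compare the reported origin against your own IP; you should also see the "this is request ip:" prints from the middleware.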
Alternatively, use Crawlera (a paid service); search online for details.
If budget is not a concern, you can simply buy proxy IPs on Taobao; they are stable and not particularly expensive.
Searching GitHub for "proxy ip" turns up many more options.