python开发自己的工具包_爬虫开发python工具包介绍 (4)
本文來自網易云社區
作者:王濤
此處我們給出幾個常用的代碼例子,包括get,post(json,表單),帶證書訪問:
# Get 請求 (GET request)
@gen.coroutine
def fetch_url():
    """Send a GET request to the Sogou WeChat search page via a local proxy.

    The request goes through the proxy at 127.0.0.1:8888 (presumably the
    Fiddler listener used for the captures in this article -- confirm).
    The response status code and body are printed. The current IOLoop is
    stopped afterwards, whether or not the request succeeded, so the
    script can exit instead of hanging after a failed fetch.
    """
    try:
        c = CurlAsyncHTTPClient()  # create the HTTP client
        myheaders = {
            "Host": "weixin.sogou.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": (
                "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
                "AppleWebKit/532.5 (KHTML, like Gecko) "
                "Chrome/4.0.249.0 Safari/532.5 "
            ),
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/webp,image/apng,*/*;q=0.8"
            ),
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        url = (
            "http://weixin.sogou.com/weixin?type=1&s_from=input"
            "&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5"
            "&ie=utf8&_sug_=n&_sug_type_="
        )
        req = HTTPRequest(
            url=url,
            method="GET",
            headers=myheaders,
            follow_redirects=True,
            request_timeout=20,
            connect_timeout=10,
            proxy_host="127.0.0.1",
            proxy_port=8888,
        )
        response = yield c.fetch(req)  # send the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        # Stop the loop on failure too; otherwise io_loop.start() never returns.
        IOLoop.current().stop()
Fiddler 抓到的報文請求頭:
# POST JSON數據請求 (POST request with a JSON body)
@gen.coroutine
def fetch_url():
    """POST a JSON document to 127.0.0.1 through a local proxy.

    The body is a JSON-encoded dict and the request carries a JSON
    ``Content-Type`` header. The request goes through the proxy at
    127.0.0.1:8888. The response status code and body are printed, and
    the current IOLoop is stopped afterwards whether or not the request
    succeeded.
    """
    try:
        c = CurlAsyncHTTPClient()  # create the HTTP client
        myheaders = {
            "Host": "weixin.sogou.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": (
                "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
                "AppleWebKit/532.5 (KHTML, like Gecko) "
                "Chrome/4.0.249.0 Safari/532.5 "
            ),
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/webp,image/apng,*/*;q=0.8"
            ),
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "Application/json",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        url = (
            "http://127.0.0.1?type=1&s_from=input"
            "&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5"
            "&ie=utf8&_sug_=n&_sug_type_="
        )
        body = json.dumps({"key1": "value1", "key2": "value2"})  # JSON payload
        req = HTTPRequest(
            url=url,
            method="POST",
            headers=myheaders,
            follow_redirects=True,
            request_timeout=20,
            connect_timeout=10,
            proxy_host="127.0.0.1",
            proxy_port=8888,
            body=body,
        )
        response = yield c.fetch(req)  # send the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        # Stop the loop on failure too; otherwise io_loop.start() never returns.
        IOLoop.current().stop()
Fiddler 抓到的報文請求頭:
# POST Form表單數據請求 (POST request with a URL-encoded form body)
@gen.coroutine
def fetch_url():
    """POST URL-encoded form data to 127.0.0.1 through a local proxy.

    The body is built with ``urlencode``. No explicit ``Content-Type`` is
    set, unlike the JSON variant of this snippet. The request goes through
    the proxy at 127.0.0.1:8888. The response status code and body are
    printed, and the current IOLoop is stopped afterwards whether or not
    the request succeeded.
    """
    # urlencode lives in different modules on Python 2 and Python 3.
    try:
        from urllib import urlencode  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3

    try:
        c = CurlAsyncHTTPClient()  # create the HTTP client
        myheaders = {
            "Host": "weixin.sogou.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": (
                "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
                "AppleWebKit/532.5 (KHTML, like Gecko) "
                "Chrome/4.0.249.0 Safari/532.5 "
            ),
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/webp,image/apng,*/*;q=0.8"
            ),
            "Accept-Encoding": "gzip, deflate",
            # Deliberately no JSON Content-Type here: the body is form data.
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        url = (
            "http://127.0.0.1?type=1&s_from=input"
            "&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5"
            "&ie=utf8&_sug_=n&_sug_type_="
        )
        body = urlencode({"key1": "value1", "key2": "value2"})  # form payload
        req = HTTPRequest(
            url=url,
            method="POST",
            headers=myheaders,
            follow_redirects=True,
            request_timeout=20,
            connect_timeout=10,
            proxy_host="127.0.0.1",
            proxy_port=8888,
            body=body,
        )
        response = yield c.fetch(req)  # send the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        # Stop the loop on failure too; otherwise io_loop.start() never returns.
        IOLoop.current().stop()
Fiddler 抓到的報文請求頭:
# 添加證書訪問 (access an HTTPS site with a custom CA certificate)
@gen.coroutine
def fetch_url():
    """Fetch https://www.amazon.com/ through a local proxy with a custom CA file.

    ``ca_certs="FiddlerRoot.pem"`` is passed to the request so the
    certificate presented via the proxy at 127.0.0.1:8888 can be verified
    (presumably Fiddler's exported root certificate -- confirm the file
    path). The response status code and body are printed, and the current
    IOLoop is stopped afterwards whether or not the request succeeded.

    The function yields a Future, so it must be a ``gen.coroutine``;
    without the decorator, calling it only creates a generator object and
    no request is ever sent.
    """
    try:
        c = CurlAsyncHTTPClient()  # create the HTTP client
        myheaders = {
            "Host": "www.amazon.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/68.0.3440.106 Safari/537.36"
            ),
            "Accept": (
                "text/html,application/xhtml+xml,"
                "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
            ),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        url = "https://www.amazon.com/"
        req = HTTPRequest(
            url=url,
            method="GET",
            headers=myheaders,
            follow_redirects=True,
            request_timeout=20,
            connect_timeout=10,
            proxy_host="127.0.0.1",
            proxy_port=8888,
            ca_certs="FiddlerRoot.pem",  # CA bundle used to verify the server
        )
        response = yield c.fetch(req)  # send the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        # Stop the loop on failure too; otherwise io_loop.start() never returns.
        IOLoop.current().stop()
Fiddler抓到的報文(說明可以正常訪問)
四、總結
抓取量少的時候,建議使用requests,簡單易用。
并發量大的時候,建議使用tornado,單線程高并發,高效易編程。
以上給出了requests和Fiddler中常用的接口和參數說明,能解決爬蟲面對的大部分問題,包括并發抓取、日常的反爬應對,https網站的抓取。
附上一段我自己的常用抓取代碼邏輯:
import random
import time

from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop
from tornado.ioloop import PeriodicCallback
from tornado.queues import Queue
# Shared task queue consumed by the url_fetcher workers; holds at most 1000 items.
TASK_QUE = Queue(maxsize=1000)
def response_handler(res):
    """Handle a fetched response (placeholder, currently a no-op).

    Intended to parse the target data out of ``res`` and add any newly
    discovered URLs to the task queue.

    Args:
        res: The response payload; the fetch loop passes the response body.
    """
    pass
@gen.coroutine
def url_fetcher_without_param():
    """Placeholder coroutine showing a worker that takes no arguments."""
    pass
@gen.coroutine
def url_fetcher(*args, **kwargs):
    """Worker loop: take tasks from TASK_QUE, fetch them, and throttle itself.

    Runs forever. Each iteration waits up to five minutes for a task. If
    none arrives it sleeps a random 10-100 seconds and retries. A fetched
    response with status 200 has its body passed to ``response_handler``.
    Any fetch error causes an extra 10-second back-off. Every fetch attempt
    is followed by a random 10-100 second sleep to keep the crawl slow.

    Args:
        *args: Unused; accepted so ``spawn_callback`` can pass a worker id.
        **kwargs: Unused.
    """
    c = CurlAsyncHTTPClient()
    while True:
        try:
            # Queue.get() returns a Future; it must be yielded both to obtain
            # the item and for the timeout to be raised here.
            param = yield TASK_QUE.get(time.time() + 300)  # 5-minute deadline
        except gen.TimeoutError:
            yield gen.sleep(random.randint(10, 100))
            continue
        try:
            # NOTE(review): assumes each queued item is a URL string; add
            # method/headers/proxy arguments here as needed.
            req = HTTPRequest(url=param)
            response = yield c.fetch(req)
            if response.code == 200:
                response_handler(response.body)
        except Exception:
            yield gen.sleep(10)  # back off after a failed fetch
        finally:
            print("I am a slow spider")
            yield gen.sleep(random.randint(10, 100))
@gen.coroutine
def period_callback():
    """Placeholder coroutine intended to be invoked periodically."""
    pass
def main():
    """Schedule the crawler coroutines on the current IOLoop and run it.

    Spawns two ``url_fetcher`` workers and one ``url_fetcher_without_param``
    worker, registers ``period_callback`` as a periodic callback, then
    starts the loop. ``io_loop.start()`` blocks until the loop is stopped.
    """
    io_loop = IOLoop.current()
    # Concurrency: each spawn_callback starts an independent worker.
    io_loop.spawn_callback(url_fetcher, 1)
    io_loop.spawn_callback(url_fetcher, 2)
    io_loop.spawn_callback(url_fetcher_without_param)  # arguments are optional
    # For periodic work, use PeriodicCallback. It is a class in
    # tornado.ioloop, not an IOLoop method, and it requires the interval.
    PERIOD_CALLBACK_MILSEC = 10  # interval in milliseconds
    PeriodicCallback(period_callback, PERIOD_CALLBACK_MILSEC).start()
    io_loop.start()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
以上,歡迎討論交流
五、參考:
網易云免費體驗館,0成本體驗20+款云產品!
更多網易研發、產品、運營經驗分享請訪問網易云社區。
總結
以上是生活随笔為你收集整理的python开发自己的工具包_爬虫开发python工具包介绍 (4)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: win7优化设置_5项优化,至少提升20
- 下一篇: python第一周心得_python第一