python chrome headless_[技巧] chrome headless 爬虫抓取websocket 数据
目錄
源起
分析
實踐
總結
源起
周末答應了一個朋友幫他看一下一個網站應該怎么爬,廢話不說直接先上網站
https://datacenter.jin10.com/price
數據一直在不停的閃,直覺判斷這種高頻的顯示應該不會用ajax 輪詢的方式,至少也是websocket的方式
分析
老規(guī)矩,直接上chrome 的f12來分析看看
直接看到,右邊正在瘋狂的刷新數據,而使用的協議,正是websocket ,對于python下如何連接websocket ,網上有很多的文章,我這里就不用細說了,通常我們拿到這樣的接口,都會本能去嘗試直連看看,在進一步嘗試之后,發現他們的api應該有一種特別的方式(又或者我的代碼有寫錯的地方)
# coding:utf-8
from websocket import create_connection
from websocket import ABNF
# Websocket endpoint of the price feed. The `sid` query parameter is a
# session id copied from one browser session (it equals the `io` cookie
# value below), so it is presumably only valid for that session.
api = "wss://sshibikfdn.jin10.com:9084/socket.io/?EIO=3&transport=websocket&sid=VsJvZikGdc8spBaPAAMO"
# Handshake headers passed to create_connection() in get_web_socket().
# NOTE(review): values such as Sec-WebSocket-Key and the cookies look like a
# one-off capture from the browser's network panel -- confirm before reuse.
headers = {
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    # Cookie string; also passed separately via the `cookie=` argument.
    'Cookie': 'UM_distinctid=16614315bf179b-0354a40c6714ff-34677908-232800-16614315bf2acb; XSRF-TOKEN=eyJpdiI6IkNZRU9uSmM1ZnY2M0VqNUttK1pxRGc9PSIsInZhbHVlIjoiRlJpNlRuekIxTDJZeVd3bHpvXC9OUEZGamw4VndZNEdXTEVsRjRMaFFyOEIxUHRtNDdTc1JaQ042eG4xdjlFeWJjWGlkcWFaeWl6NTRVUUlQMThaZmJ3PT0iLCJtYWMiOiJkYWU1MzQ2NjEyM2U3OTk0MzY5NWNjZTdhZmNlZjE0YTViMjc2YzBiYWM4YjhiMjNhZmRjMzU3YzliNDg3ZGIzIn0%3D; laravel_session=eyJpdiI6IjBRS3h0Y29XcGRBRlFIc0xIeWFiZGc9PSIsInZhbHVlIjoibVRLblpNTDJJa1JIN1ZJc0s5c2xrSkYzckNadDB6aGp0REd5SVJQTlkxNVAzajhvdXY5ZElSQ3VTcGVicjNiSXZ3NE9pZDZOdHJUM1d6WG1KQjZXNkE9PSIsIm1hYyI6Ijg3MWVkZDVlMDFjZDM2NDRjZmI2ZDhkNDJmZGI5MjNhMzk3MTViNmI1YTNmMDRmYWJjNzQ4ZGU2YWZhNzNhNzUifQ%3D%3D; io=VsJvZikGdc8spBaPAAMO',
    'Host': 'sshibikfdn.jin10.com:9084',
    'Connection': 'Upgrade',
    'Origin': 'https://datacenter.jin10.com',
    'Pragma': 'no-cache',
    'Sec-WebSocket-Extensions': 'permessage-deflate; client_max_window_bits',
    'Sec-WebSocket-Key': 'g4UA3smEJ0eGufMkyz7AOw==',
    'Sec-WebSocket-Version': '13',
    'Upgrade': 'websocket',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
def get_web_socket():
    """Connect directly to the price-feed websocket and print one frame.

    Opens a websocket to the module-level ``api`` URL with the module-level
    ``headers``, sends the ``2probe`` text frame, and prints the first frame
    received in reply. The socket is closed on every exit path so a failed
    send or receive does not leak the connection.
    """
    start_message = "2probe"
    ws = create_connection(
        api,
        header=headers,
        cookie=headers['Cookie'],
    )
    try:
        frame = ABNF.create_frame(start_message, ABNF.OPCODE_TEXT)
        ws.send_frame(frame)
        data = ws.recv_frame()
        print(data)
    finally:
        ws.close()
# Run the direct-connection experiment only when executed as a script.
if __name__ == '__main__':
    get_web_socket()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# coding:utf-8
fromwebsocketimportcreate_connection
fromwebsocketimportABNF
api="wss://sshibikfdn.jin10.com:9084/socket.io/?EIO=3&transport=websocket&sid=VsJvZikGdc8spBaPAAMO"
headers={
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
'Cookie':'UM_distinctid=16614315bf179b-0354a40c6714ff-34677908-232800-16614315bf2acb; XSRF-TOKEN=eyJpdiI6IkNZRU9uSmM1ZnY2M0VqNUttK1pxRGc9PSIsInZhbHVlIjoiRlJpNlRuekIxTDJZeVd3bHpvXC9OUEZGamw4VndZNEdXTEVsRjRMaFFyOEIxUHRtNDdTc1JaQ042eG4xdjlFeWJjWGlkcWFaeWl6NTRVUUlQMThaZmJ3PT0iLCJtYWMiOiJkYWU1MzQ2NjEyM2U3OTk0MzY5NWNjZTdhZmNlZjE0YTViMjc2YzBiYWM4YjhiMjNhZmRjMzU3YzliNDg3ZGIzIn0%3D; laravel_session=eyJpdiI6IjBRS3h0Y29XcGRBRlFIc0xIeWFiZGc9PSIsInZhbHVlIjoibVRLblpNTDJJa1JIN1ZJc0s5c2xrSkYzckNadDB6aGp0REd5SVJQTlkxNVAzajhvdXY5ZElSQ3VTcGVicjNiSXZ3NE9pZDZOdHJUM1d6WG1KQjZXNkE9PSIsIm1hYyI6Ijg3MWVkZDVlMDFjZDM2NDRjZmI2ZDhkNDJmZGI5MjNhMzk3MTViNmI1YTNmMDRmYWJjNzQ4ZGU2YWZhNzNhNzUifQ%3D%3D; io=VsJvZikGdc8spBaPAAMO',
'Host':'sshibikfdn.jin10.com:9084',
'Connection':'Upgrade',
'Origin':'https://datacenter.jin10.com',
'Pragma':'no-cache',
'Sec-WebSocket-Extensions':'permessage-deflate; client_max_window_bits',
'Sec-WebSocket-Key':'g4UA3smEJ0eGufMkyz7AOw==',
'Sec-WebSocket-Version':'13',
'Upgrade':'websocket',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
defget_web_socket():
start_message="2probe"
ws=create_connection(
api,
header=headers,
cookie=headers['Cookie'],
#origin=headers['Origin'],
#host=headers['Host']
)
frame=ABNF.create_frame("2probe",ABNF.OPCODE_TEXT)
ws.send_frame(frame)
data=ws.recv_frame()
print(data)
if__name__=='__main__':
get_web_socket()
在運行程序之后毫無效果,另外端口的api會根據真實的請求變化 ,而且進一步的請求的cookie和key都會變化 ,看來直連的方式是行不通了,那沒辦法,只能走渲染的路了,selenium ? 可以是可以,不過我們要嘗試一下新的路線和方法,那就直接上chrome-headless
實踐
Headless Chrome指在headless模式下運行谷歌瀏覽器(以程序模式運行,沒有界面),自從這玩意兒出來之后, phantomjs的作者就宣布甩鍋不維護了(人家也確實辛苦,沒啥收益),可以說是一個非常好的工具了,咱們說干就干
安裝
直接使用docker 來安裝chrome headless
docker run -d -p 9222:9222 --cap-add=SYS_ADMIN justinribeiro/chrome-headless
1
2
dockerrun-d-p9222:9222--cap-add=SYS_ADMINjustinribeiro/chrome-headless
環境
python 3.6
ubuntu16.04
pip install websocket-client
pip install requests
1
2
3
pipinstallwebsocket-client
pipinstallrequests
編碼
這樣我們已經啟用了一個chrome headless的服務,那如何使用呢,我們使用websocket 和chrome headless進行交互,不多說了,直接上代碼吧
import json
import time
import requests
import websocket
# Counter incremented by run_command() and sent as the `id` of each command.
request_id = 0
# Page that get_element() navigates to and reads prices from.
target_url = 'https://datacenter.jin10.com/price'
def get_websocket_connection(debug_url='http://10.10.2.42:9222/json', timeout=10):
    """Open a websocket to a headless-Chrome debugging target.

    Args:
        debug_url: HTTP endpoint that lists the debuggable targets. The
            default is the address of the machine running the
            chrome-headless docker container.
        timeout: Seconds to wait for the HTTP listing before giving up, so
            an unreachable host does not block forever.

    Returns:
        The connection returned by ``websocket.create_connection`` for the
        first listed target that exposes a ``webSocketDebuggerUrl``.

    Raises:
        ValueError: If the listing does not answer with HTTP 200, or no
            listed target exposes a ``webSocketDebuggerUrl``.
    """
    r = requests.get(debug_url, timeout=timeout)
    if r.status_code != 200:
        raise ValueError("can not get the api ,please check if docker is ready")
    # The listing may be empty, or an entry may lack the debugger URL; skip
    # such entries instead of failing with IndexError or connecting to None.
    for target in r.json():
        conn_api = target.get('webSocketDebuggerUrl')
        if conn_api:
            return websocket.create_connection(conn_api)
    raise ValueError("can not find a debuggable target ,please check if docker is ready")
def run_command(conn, method, **kwargs):
    """Send one command over ``conn`` and return its matching response.

    Args:
        conn: Object with ``send(str)`` and ``recv() -> str`` methods.
        method: Command name, e.g. ``'Page.navigate'``.
        **kwargs: Sent as the command's ``params`` object.

    Returns:
        The decoded JSON message whose ``id`` equals the id of the command
        just sent.

    Side effects:
        Increments the module-level ``request_id`` counter.
    """
    global request_id
    request_id += 1
    command_id = request_id
    command = {'method': method,
               'id': command_id,
               'params': kwargs}
    conn.send(json.dumps(command))
    # Other frames may arrive before the reply; keep reading until the
    # message with our id shows up instead of silently returning None.
    while True:
        msg = json.loads(conn.recv())
        if msg.get('id') == command_id:
            return msg
def get_element():
    """Load the target page in headless Chrome and print a price 20 times.

    Navigates to ``target_url``, waits for the page to render, then once a
    second evaluates a JavaScript snippet that reads the text of the first
    ``.jin-pricewall_list-item_b`` element and prints the returned value.
    ``None`` is printed when a response carries no value (for example, the
    script threw because the element was not there yet). The debugging
    connection is closed on every exit path.
    """
    conn = get_websocket_connection()
    try:
        run_command(conn, 'Page.navigate', url=target_url)
        # Give the page time to load before sampling.
        time.sleep(5)
        js = "var p = document.querySelector('.jin-pricewall_list-item_b').innerText ; p ;"
        for _ in range(20):
            time.sleep(1)
            msg = run_command(conn, 'Runtime.evaluate', expression=js) or {}
            # A failed evaluation has no 'value'; look it up defensively
            # rather than crashing the sampling loop.
            remote_object = (msg.get('result') or {}).get('result') or {}
            print(remote_object.get('value'))
    finally:
        conn.close()
# Run the headless-Chrome scraper only when executed as a script.
if __name__ == '__main__':
    get_element()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
importjson
importtime
importrequests
importwebsocket
request_id=0
target_url='https://datacenter.jin10.com/price'
defget_websocket_connection():
r=requests.get('http://10.10.2.42:9222/json')#這是開啟docker chrome headless的機器地址
ifr.status_code!=200:
raiseValueError("can not get the api ,please check if docker is ready")
conn_api=r.json()[0].get('webSocketDebuggerUrl')
returnwebsocket.create_connection(conn_api)
defrun_command(conn,method,**kwargs):
globalrequest_id
request_id+=1
command={'method':method,
'id':request_id,
'params':kwargs}
conn.send(json.dumps(command))
#while True:
msg=json.loads(conn.recv())
ifmsg.get('id')==request_id:
returnmsg
defget_element():
conn=get_websocket_connection()
msg=run_command(conn,'Page.navigate',url=target_url)
time.sleep(5)
js="var p = document.querySelector('.jin-pricewall_list-item_b').innerText ; p ;"
for_inrange(20):
time.sleep(1)
msg=run_command(conn,'Runtime.evaluate',expression=js)
print(msg.get('result')['result']['value'])
if__name__=='__main__':
get_element()
整體邏輯非常簡單,打開指定頁面,等待頁面數據刷新,然后直接偷懶拿數據渲染之后的頁面值,運行效果如下:
其中的0 是因為頁面還在渲染之中,所以數據還沒有正式的出現在前臺界面上
總結
本次主要使用了chrome-headless的相關渲染環境來解決了我們抓取數據的問題,并且使用了websocket api 來進一步操作,其實google 官方有sdk進行操作,https://github.com/GoogleChrome/puppeteer ,渲染的終究不是高效的做法,但是對于這種單頁面目的性很強的數據,可以嘗試渲染大法
總結
以上是生活随笔為你收集整理的python chrome headless_[技巧] chrome headless 爬虫抓取websocket 数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 口袋妖怪复刻白鬼配招怎么玩
- 下一篇: python定时器库_Python定时器