python apply_async数据量大不执行_apply_async里面的函数不执行
源自:2-3 加入代理邏輯隱藏爬蟲 使用多進程加速抓取
apply_async里面的函數不執行
#?coding=utf-8
import?json
import?re
import?time
import?multiprocessing
import?requests
class?HandleLaGou(object):
def?__init__(self):
self.lagou_session?=?requests.session()
self.header?=?{
'User-Agent':?'Mozilla/5.0?(Macintosh;?Intel?Mac?OS?X?10_15_0)?AppleWebKit/537.36?(KHTML,?like?Gecko)?Chrome/78.0.3904.108?Safari/537.36'
}
self.city_list?=?""
#?獲取全國所有城市列表
def?handle_city(self):
city_search?=?re.compile(r'zhaopin/">(.*?)')
city_url?=?'https://www.lagou.com/jobs/allCity.html'
city_result?=?self.handle_request(method='GET',?url=city_url)
#?使用正則表達式獲取城市列表
self.city_list?=?city_search.findall(city_result)
self.lagou_session.cookies.clear()
def?handle_city_job(self,?city):
first_request_url?=?'https://www.lagou.com/jobs/list_python?&px=default&city=%s'?%?city
first_response?=?self.handle_request(method='GET',?url=first_request_url)
total_page_search?=?re.compile(r'class="span\stotalNum">(\d+)')
try:
total_page?=?total_page_search.search(first_response).group(1)
except:
return
else:
for?i?in?range(1,?int(total_page)?+?1):
data?=?{
'pn':?i,
'kd':?'python'
}
page_url?=?'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false'?%?city
referer_url?=?'https://www.lagou.com/jobs/list_python?&px=default&city=%s'?%?city
self.header['Referer']?=?referer_url.encode()
response?=?self.handle_request(method='POST',?url=page_url,?data=data,?info=city)
print?response
lagou_data?=?json.loads(response)
job_list?=?lagou_data['content']['positionResult']['result']
for?job?in?job_list:
print?job
def?handle_request(self,?method,?url,?data=None,?info=None):
global?response
while?True:
#?阿布云代理
#?代理服務器
proxyHost?=?"http-dyn.abuyun.com"
proxyPort?=?"9020"
#?代理隧道驗證信息
proxyUser?=?"H6451437A9W24E7D"
proxyPass?=?"A86CD1F6AF3AD760"
proxyMeta?=?"http://%(user)s:%(pass)s@%(host)s:%(port)s"?%?{
"host":?proxyHost,
"port":?proxyPort,
"user":?proxyUser,
"pass":?proxyPass,
}
proxies?=?{
"http":?proxyMeta,
"https":?proxyMeta,
}
try:
if?method?==?'GET':
response?=?self.lagou_session.get(
url=url,
headers=self.header,
proxies=proxies,
timeout=6
)
elif?method?==?'POST':
response?=?self.lagou_session.post(
url=url,
headers=self.header,
data=data,
proxies=proxies,
timeout=6
)
except:
self.lagou_session.cookies.clear()
first_request_url?=?'https://www.lagou.com/jobs/list_python?&px=default&city=%s'?%?info
self.handle_request(method='GET',?url=first_request_url)
time.sleep(10)
continue
response.encoding?=?'utf-8'
if?'頻繁'?in?response.text:
print?response.text
self.lagou_session.cookies.clear()
first_request_url?=?'https://www.lagou.com/jobs/list_python?&px=default&city=%s'?%?info
self.handle_request(method='GET',?url=first_request_url)
time.sleep(10)
continue
return?response.text
if?__name__?==?'__main__':
lagou?=?HandleLaGou()
lagou.handle_city()
#?引入多進程
pool?=?multiprocessing.Pool(1)
for?city?in?lagou.city_list:
pool.apply_async(lagou.handle_city_job,?args=(city,1))
pool.close()
pool.join()
執行結果/usr/local/bin/python2.7?/Users/imooc_lagou/handle_crawl_lagou.pyProcess
finished?with?exit?code?0
提問者:Micksun
2019-12-13 00:14
總結
以上是生活随笔為你收集整理的python apply_async数据量大不执行_apply_async里面的函数不执行的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python中访问命令行参数_如何在Py
- 下一篇: python 列表中的数字转字符串_py