日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程资源 > 综合教程 >内容正文

综合教程

拉勾网数据抓取

發布時間:2023/12/15 综合教程 34 生活家
生活随笔收集整理的這篇文章主要介紹了《拉勾网数据抓取》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
import json
import re
import time

import requests
import multiprocessing


class HandleLaGou():
    """Scraper that prints Python job postings from lagou.com, city by city.

    One ``requests.Session`` is shared by all requests so that cookies set by
    the search page are sent along with the follow-up Ajax requests.
    """

    def __init__(self):
        # Use a session so cookie information is kept between requests.
        self.lagou_session = requests.Session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }
        # Replaced by a list of city names once handle_city() has run.
        self.city_list = ""

    def handle_city(self):
        """Fetch the all-cities page and store the city names in ``self.city_list``."""
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = 'https://www.lagou.com/jobs/allCity.html'
        city_result = self.handle_request(method="GET", url=city_url)
        # Extract the city names with the regular expression.
        self.city_list = city_search.findall(city_result)
        # Clear the cookies picked up by this request.
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Print every Python job posting listed for ``city``.

        Loads the search page for the city to read the total page count, then
        POSTs to the position Ajax endpoint once per page and prints each job
        entry, followed by the page count.

        Args:
            city: City name as it appears in ``self.city_list``.

        Returns:
            None. Returns early, printing nothing, when the search page
            contains no page count (i.e. there are no postings).

        Raises:
            ValueError, KeyError: if an Ajax response is not the expected JSON
                (``json.JSONDecodeError`` is a ``ValueError``).
        """
        first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
        first_response = self.handle_request(method="GET", url=first_request_url)
        # The escapes matter: the element is <span class="span totalNum">N</span>.
        total_page_search = re.compile(r'class="span\s+totalNum">(\d+)</span>')
        match = total_page_search.search(first_response)
        if match is None:
            # No page count on the page: nothing to scrape for this city.
            return
        total_page = match.group(1)

        # Both URLs depend only on the city, so build them once.
        page_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false" % city
        referer_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
        # The referer is sent as bytes; presumably because the city name is
        # usually non-ASCII and would not survive default header encoding.
        self.header['Referer'] = referer_url.encode()

        for i in range(1, int(total_page) + 1):
            data = {
                "pn": i,
                "kd": "python",
            }
            response = self.handle_request("POST", page_url, data=data, info=city)
            lagou_data = json.loads(response)
            job_list = lagou_data['content']['positionResult']['result']
            for job in job_list:
                print(job)
        print(total_page)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the proxy, retrying until a usable page arrives.

        Args:
            method: ``"GET"`` or ``"POST"``.
            url: Target URL.
            data: Form data for POST requests.
            info: City name used to re-seed cookies from the search page when
                a retry is needed; ``None`` skips the re-seeding request.

        Returns:
            The response body decoded as UTF-8 text.

        Raises:
            ValueError: if ``method`` is neither ``"GET"`` nor ``"POST"``.

        Network errors (``requests.RequestException``) and rate-limit pages
        are retried indefinitely with a 10 second pause between attempts.
        """
        if method not in ("GET", "POST"):
            raise ValueError("unsupported HTTP method: %r" % (method,))

        # Abuyun proxy settings. The four values are placeholders that must be
        # replaced with a real account, password, host and port.
        proxyinfo = "http://%s:%s@%s:%s" % ('阿布云賬號', '阿布云密碼', '阿布云host', '阿布云port')
        proxy = {
            "http": proxyinfo,
            "https": proxyinfo,
        }
        while True:
            try:
                if method == "GET":
                    response = self.lagou_session.get(url=url, headers=self.header, proxies=proxy, timeout=6)
                else:
                    response = self.lagou_session.post(url=url, headers=self.header, data=data, proxies=proxy, timeout=6)
            except requests.RequestException:
                self._reset_session(proxy, info)
                continue
            response.encoding = 'utf-8'
            # Rate-limit notice ("too frequent"). Both the Simplified and the
            # Traditional spelling are checked; NOTE(review): the site most
            # likely serves the Simplified form - confirm against a live response.
            if '频繁' in response.text or '頻繁' in response.text:
                print("頻繁")
                self._reset_session(proxy, info)
                continue
            return response.text

    def _reset_session(self, proxy, info):
        """Clear cookies, re-seed them from the search page if possible, then pause."""
        self.lagou_session.cookies.clear()
        if info is not None:
            first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
            try:
                # Single best-effort request (no recursion): only the cookies
                # it sets are of interest, the body is ignored.
                self.lagou_session.get(url=first_request_url, headers=self.header, proxies=proxy, timeout=6)
            except requests.RequestException:
                # The caller retries after the pause below anyway.
                pass
        time.sleep(10)


if __name__ == '__main__':
    lagou = HandleLaGou()
    # Fetch the list of all cities.
    lagou.handle_city()

    def report_failure(exc):
        # Runs in the parent process. Without it, an exception raised inside a
        # worker would be dropped, because the async results are never read.
        print("city job failed: %r" % (exc,))

    # Use a process pool to speed up the crawl.
    pool = multiprocessing.Pool(2)
    try:
        for city in lagou.city_list:
            pool.apply_async(lagou.handle_city_job, args=(city,), error_callback=report_failure)
    finally:
        # Always stop accepting work and wait for submitted jobs to finish.
        pool.close()
        pool.join()

總結

以上是生活随笔為你收集整理的拉钩网数据抓取的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。