python抓取数据库_Python-7.爬取大量数据存入数据库
本章包含內容:
前言
mongodb環境配置
爬取數據的代碼分析
一、前言
在更新完上一篇python文章時,就一直想爬取一個10萬量級的數據。在了解了mongodb的基本用法和環境配置后,第一次測試的是安居客的二手房,遇到了很多小問題,最終沒能繼續下去。今天這次測試的是趕集網的跳蚤市場商品,在經過幾次調試后,程序最終得以正常運行。可惜的是趕集網跳蚤市場數據總數也才4萬多條,沒有達到目標的10萬條數據。但麻雀雖小,五臟俱全,雖然數據量沒有達到,但爬取的原理卻是一樣的,下面將一一分析本次爬取的代碼。
二、mongodb環境配置
1
2
3
4
5
6
7
8
9
10
11
12
13
14
三、爬取數據的代碼分析
1.首先爬取跳蚤市場各個分類的入口鏈接
from bs4 import BeautifulSoup
import requests
# Site root, prepended to the relative hrefs found on the index page.
url_host = 'http://bj.ganji.com'
# Index page of the flea-market section; it lists every category.
url = 'http://bj.ganji.com/wu/'

wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')

# Each category entry is an <a> directly under a <dt> inside '.fenlei'.
mainUrlStrs = soup.select('.fenlei > dt > a')
for mainUrlStr in mainUrlStrs:
    # Build the full category URL from host + href and print it.
    category_href = mainUrlStr.get('href')
    print(url_host + category_href)
#輸出:
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
經過篩選,剔除不符合需求的入口鏈接(比如手機號碼之類的入口)之后,以下就是我們需要的入口:
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
2.根據入口鏈接爬取商品信息,同時寫入數據庫(核心代碼)
from bs4 import BeautifulSoup
import requests
import random
import pymongo
# Connect to MongoDB: 'localhost' is the local machine, 27017 the server port.
client = pymongo.MongoClient(host='localhost', port=27017)

# Handle to the 'ganjiwang' database.
ganjiwang = client['ganjiwang']

# Handle to the 'list_info' collection that receives the scraped listings.
list_info = ganjiwang['list_info']

# HTTP headers sent with every request to ganji.com.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
    'Connection': 'keep-alive',
}

# Candidate proxies. The original author's rationale: frequent requests from a
# single IP are treated by the server as an attack, so a proxy is picked at random.
proxy_list = [
    'http://125.88.74.122:85',
    'http://125.88.74.122:83',
    'http://171.8.79.143:8080',
    'http://14.152.93.79:8080',
]

# One proxy is chosen here, once, when this module is loaded.
proxy_ip = random.choice(proxy_list)
proxies = dict(http=proxy_ip)
# Core list-page scraper. Every category page is parsed the same way; only the
# CSS selectors differ, so the per-category differences live in the tables below.

# Layout used by every category that has no dedicated entry in _CATEGORY_LAYOUTS.
_DEFAULT_LAYOUT = {
    'label': None,
    'title': '.t > a',
    'img': '.js-lazy-load',
    'price': '.pricebiao > span',
    'detail': '.desc',
    'location': '#infolist > div.infocon > table > tbody > tr > td.t > span.fl',
}

# (URL prefix, layout) pairs for the categories that use the '.list-bigpic' markup.
# 'label' is printed once per page when set; 'detail' is None when the page
# has no description node, in which case '無' is stored as the detail text.
_CATEGORY_LAYOUTS = (
    ('http://bj.ganji.com/bangong/', {
        'label': '二手設備',
        'title': '.js-item > a',
        'img': '.list-bigpic > dt > a > img',
        'price': '.pt-price > span',
        'detail': None,
        'location': '.feature > span',
    }),
    ('http://bj.ganji.com/nongyongpin/', {
        'label': None,
        'title': '.js-item > a',
        'img': '.list-bigpic > dt > a > img',
        'price': '.pt-price > span',
        'detail': None,
        'location': '.list-word > a',
    }),
    ('http://bj.ganji.com/xianzhilipin/', {
        'label': '二手閑置禮品',
        'title': '.js-item > a',
        'img': '.list-bigpic > dt > a > img',
        'price': '.pt-price > span',
        'detail': '.feature > p',
        'location': '.list-word > a',
    }),
    ('http://bj.ganji.com/xuniwupin/', {
        'label': '二手虛擬物品',
        'title': '.js-item > a',
        'img': '.list-bigpic > dt > a > img',
        'price': '.pt-price > span',
        'detail': '.feature > p',
        'location': '.list-word > a',
    }),
)


def _layout_for(url):
    """Return the selector layout whose URL prefix matches ``url``."""
    for prefix, layout in _CATEGORY_LAYOUTS:
        if url.startswith(prefix):
            return layout
    return _DEFAULT_LAYOUT


def _image_url(img):
    """Return the ``data-original`` attribute if present, otherwise ``src``."""
    return img.get('data-original') or img.get('src')


def _store_listings(soup, layout):
    """Extract every listing from ``soup`` using ``layout`` and insert it into ``list_info``."""
    titles = soup.select(layout['title'])
    imgs = soup.select(layout['img'])
    prices = soup.select(layout['price'])
    locations = soup.select(layout['location'])

    has_detail = layout['detail'] is not None
    if has_detail:
        detail_texts = [node.text for node in soup.select(layout['detail'])]
    else:
        # Pad to len(titles) so this list never shortens the zip below.
        detail_texts = ['無'] * len(titles)

    # zip() stops at the shortest list, so only rows for which every selector
    # produced a node are stored.
    for title, img, price, detail_text, location in zip(
            titles, imgs, prices, detail_texts, locations):
        record = {
            'title': title.text,
            'detailUrl': title.get('href'),
            'img': _image_url(img),
            'price': price.text,
            'detailText': detail_text,
            'location': location.text or '無',
        }
        # Progress output: echo the fields being stored.
        print(record['title'])
        print(record['detailUrl'])
        print(record['img'])
        print(record['price'])
        if has_detail:
            print(record['detailText'])
        print(record['location'])
        # Write the record to the database.
        list_info.insert_one(record)


def get_list_info(url, data=None):
    """Fetch one list page and store every listing found on it.

    Args:
        url: Full URL of a category list page.
        data: Unused; kept so existing callers keep working.

    Returns:
        None. Records are written to ``list_info`` as a side effect, and the
        scraped fields are printed as progress output.
    """
    # NOTE(review): the module-level ``proxies`` mapping is not passed to this
    # request, so requests go out directly - confirm whether that is intended.
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # Past the last page of a category the site shows a 'no-search' or
    # 'noinfo' div instead of listings; nothing to store in that case.
    if soup.find('div', 'no-search') or soup.find('div', 'noinfo'):
        # NOTE(review): this message text looks garbled (stray "(jīng)");
        # left untouched because it is runtime output.
        print('已經(jīng)到最后一頁')
        return

    layout = _layout_for(url)
    if layout['label'] is not None:
        print(layout['label'])
    _store_listings(soup, layout)
3.因每個鏈接入口都有不止一頁,所以我們還需根據頁數來爬取每一頁的數據,另外就是利用CPU的多進程能力執行爬取代碼,這樣能高效地爬取數據
from multiprocessing import Pool
from pageList import get_list_info
from mainUrl import urlStr
# Ganji list URLs end in o1, o2, ... where the number is the page index, so each
# page URL is the category entry URL with 'o<page>' appended.
def get_all_list_info(url, max_pages=100):
    """Scrape list pages 1 through ``max_pages`` of one category.

    Args:
        url: Category entry URL (e.g. one of the links printed by the
            category-link script).
        max_pages: Highest page index to request. Defaults to 100, the
            "at most 100 pages" limit the original comment describes; the
            previous ``range(1, 100)`` stopped one page short at 99.
    """
    for page in range(1, max_pages + 1):
        get_list_info(url + 'o' + str(page))
if __name__ == '__main__':
    # Only start the crawl when this file is run as a script, not when it is imported.

    # Pool of worker processes. Pool() accepts a ``processes`` argument giving
    # the number of workers (e.g. processes=2); none is passed here.
    pool = Pool()

    # urlStr holds the category entry links; split() turns it into a list.
    category_urls = urlStr.split()

    # map(func, iterable): hands each category URL to get_all_list_info.
    pool.map(get_all_list_info, category_urls)

    pool.close()
    pool.join()
4.最后,我們實現一個每10秒查詢一下數據庫數據條數的程序,實時觀看爬取進度
from pageList import list_info
import time
# Print the total number of stored records every 10 seconds.
while True:
    # count_documents({}) counts every document in the collection. It replaces
    # find().count(): Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0.
    print('數(shù)據(jù)庫已寫入--{}--條數(shù)據(jù)'.format(list_info.count_documents({})))
    time.sleep(10)
至此,我們的爬蟲程序已經完成了,上述每句關鍵代碼都給出了詳細的注釋,應該不難理解。下面是執行程序的效果:
10W.gif
程序結束時:
end.png
總結
以上是生活随笔為你收集整理的python抓取数据库_Python-7.爬取大量数据存入数据库的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 墨剑江湖神龙定海任务
- 下一篇: python路由编程_Python Dj