日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程资源 > 编程问答 >内容正文

编程问答

当当网图书爬虫与数据分析

發布時間:2023/12/31 编程问答 39 豆豆
生活随笔 收集整理的這篇文章主要介紹了 当当网图书爬虫与数据分析 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

文章目錄

  • 爬蟲篇
  • 繪制圖書圖片墻
  • 數據分析篇

爬蟲篇

'''
Function: Dangdang (search.dangdang.com) book crawler.

Searches Dangdang for a keyword, walks the result pages one by one, extracts
the basic facts of every listed book and pickles them as a dict keyed by
book title.
'''
import time
import pickle
import random

import requests
from bs4 import BeautifulSoup


SEARCH_URL = 'http://search.dangdang.com/'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Sentence looked for in the response body to detect "no more results".
NO_RESULT_TEMPLATE = '抱歉,沒有找到與“%s”相關的商品,建議適當減少篩選條件'

headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'search.dangdang.com',
}


def _parseItem(item):
    '''
    Extract one book from a single <li> element of the result list.

    Returns a ``(bookname, [img_src, price, detail, stars, num_comments])``
    tuple. Raises AttributeError / IndexError / TypeError / ValueError when
    the element does not have the expected structure.
    '''
    link = item.find_all('a')[0]
    # Book title.
    bookname = link.get('title').strip(' ')
    # Cover image: 'data-original' is tried first, 'src' is the fallback.
    img_src = link.img.get('data-original')
    if img_src is None:
        img_src = link.img.get('src')
    img_src = img_src.strip(' ')
    # Price: the first character of the text (the currency sign) is dropped.
    price = float(item.find_all('p', {'class': 'price'})[0].span.text[1:])
    # Introduction text.
    detail = item.find_all('p', {'class': 'detail'})[0].text
    star_line = item.find_all('p', {'class': 'search_star_line'})[0]
    # Rating: the inline style carries a percentage (0-100); dividing by 20
    # maps it onto a 0-5 scale.
    stars = float(star_line.span.span.get('style').split(': ')[-1].strip('%;')) / 20
    # Number of comments: the last three characters of the link text are a
    # textual suffix and are cut off before conversion.
    num_comments = float(star_line.a.text[:-3])
    return bookname, [img_src, price, detail, stars, num_comments]


def parseHtml(html):
    '''
    Parse one search result page.

    Args:
        html: HTML source of a result page.

    Returns:
        dict mapping book title -> [img_src, price, detail, stars,
        num_comments]. Empty when the page has no result list.
    '''
    data = {}
    soup = BeautifulSoup(html, 'lxml')
    containers = soup.find_all('div', {'class': 'con shoplist'})
    if not containers:
        # No result list on this page (e.g. past the last page or an error
        # page): report "nothing found" instead of raising IndexError.
        return data
    for each in containers[0].find_all('li'):
        try:
            bookname, record = _parseItem(each)
        except (AttributeError, IndexError, TypeError, ValueError):
            # Entries that lack one of the expected fields are skipped so
            # that a single odd item does not abort the whole crawl.
            continue
        data[bookname] = record
    return data


def main(keyword, max_pages=None):
    '''
    Crawl every result page for ``keyword`` and pickle the collected data.

    Args:
        keyword: search term.
        max_pages: optional upper bound on the number of pages to fetch;
            ``None`` (default) crawls until a page yields no results.

    Returns:
        dict mapping book title -> [img_src, price, detail, stars,
        num_comments]. The same dict is written to
        ``<keyword>_<number of pages with data>.pkl``.
    '''
    results = {}
    # Number of pages that have been fetched AND contained data.
    num_page = 0
    while max_pages is None or num_page < max_pages:
        print('[INFO]: Start to get the data of page%d...' % (num_page + 1))
        # Passing the query through ``params`` lets requests escape the
        # keyword properly ('&', spaces, non-ASCII characters, ...).
        res = requests.get(
            SEARCH_URL,
            params={'key': keyword, 'act': 'input', 'page_index': num_page + 1},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        if NO_RESULT_TEMPLATE % keyword in res.text:
            break
        page_data = parseHtml(res.text)
        if not page_data:
            # Second stop condition: do not depend solely on the exact
            # wording of the "no results" sentence.
            break
        results.update(page_data)
        num_page += 1
        # Randomised pause between requests.
        time.sleep(random.random() + 0.5)
    with open('%s_%d.pkl' % (keyword, num_page), 'wb') as f:
        pickle.dump(results, f)
    return results


if __name__ == '__main__':
    main('python')

繪制圖書圖片墻

思路:
1)先利用爬蟲獲取當當網圖書的圖片 url
2)批量爬取圖片
3)繪制圖片墻

'''
Function: download the book cover images and tile them into a picture wall.
'''
import os
import time
import math
import pickle

import requests
from PIL import Image


PICDIR = 'pictures'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
}


def downloadPics(urls, savedir):
    '''
    Download every URL in ``urls`` into ``savedir`` as ``<index>.jpg``.

    Args:
        urls: iterable of image URLs.
        savedir: target directory; created when missing.

    A URL that cannot be fetched is reported and skipped, so one broken
    link does not abort the batch and no error page is stored as an image.
    '''
    os.makedirs(savedir, exist_ok=True)
    for idx, url in enumerate(urls):
        if url.startswith('//'):
            # Protocol-relative URL: requests needs an explicit scheme.
            url = 'http:' + url
        try:
            res = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as err:
            print('[WARNING]: Fail to download %s (%s)' % (url, err))
            continue
        if res.ok:
            with open(os.path.join(savedir, '%d.jpg' % idx), 'wb') as f:
                f.write(res.content)
        else:
            print('[WARNING]: Fail to download %s (HTTP %d)' % (url, res.status_code))
        time.sleep(0.5)


def makePicturesWall(picdir, size=64, savepath='picwall.png'):
    '''
    Tile the images found in ``picdir`` into one square picture wall.

    Args:
        picdir: directory holding the pictures.
        size: edge length in pixels of every tile (default 64).
        savepath: output file (default ``picwall.png``).

    The wall has ``int(sqrt(n))`` tiles per row and per column, so pictures
    beyond the largest full square are not used.
    '''
    # Sorted so that the resulting wall is reproducible; os.listdir returns
    # entries in arbitrary order.
    picslist = sorted(os.listdir(picdir))
    num_pics = len(picslist)
    print('照片數量', num_pics)
    line_numpics = int(math.sqrt(num_pics))  # tiles per side of the square
    if line_numpics == 0:
        print('[WARNING]: No pictures found in %s, nothing to do...' % picdir)
        return
    picwall = Image.new('RGBA', (line_numpics * size, line_numpics * size))
    # Only the first line_numpics ** 2 pictures fit onto the canvas.
    for idx, pic in enumerate(picslist[:line_numpics ** 2]):
        y, x = divmod(idx, line_numpics)
        try:
            with Image.open(os.path.join(picdir, pic)) as img:
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                tile = img.resize((size, size), Image.LANCZOS)
        except OSError:
            # Unreadable / non-image file: leave this cell empty.
            print('[WARNING]: Skip unreadable picture %s' % pic)
            continue
        picwall.paste(tile, (x * size, y * size))
    print('[INFO]: Generate pictures wall successfully...')
    picwall.save(savepath)


if __name__ == '__main__':
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # were produced by the crawler script itself.
    with open('python_61.pkl', 'rb') as f:
        data = pickle.load(f)
    # The first element of every record is the cover image URL.
    urls = [value[0] for value in data.values()]
    # Download only when the pictures are not on disk yet, so that
    # re-running the script does not fetch everything again.
    if not os.path.isdir(PICDIR) or not os.listdir(PICDIR):
        downloadPics(urls, PICDIR)
    makePicturesWall(PICDIR)

圖片墻:

數據分析篇

'''
Function: analysis and visualisation of the crawled Dangdang book data.

Reads the pickle produced by the crawler (book title -> [img_src, price,
detail, stars, num_comments]) and renders price / rating / comment-count
charts plus a word cloud of the book introductions.
'''
import os
import jieba
import pickle
from collections import Counter
from pyecharts import Bar
from pyecharts import Pie
from pyecharts import Funnel
from wordcloud import WordCloud


# A title is cut at the first occurrence of any of these characters to get
# a short chart label. The full-width '(' is included next to the ASCII one
# because titles may use either form.
_TITLE_SEPARATORS = ('【', '(', '(', ' ', ':')


def _ensureDir(savepath):
    '''Create ``savepath`` (including parent directories) when missing.'''
    os.makedirs(savepath, exist_ok=True)


def _shortTitle(bookname):
    '''Return the part of ``bookname`` before the first separator.'''
    for sep in _TITLE_SEPARATORS:
        bookname = bookname.split(sep)[0]
    return bookname


def drawBar(title, data, savepath='./results'):
    '''
    Render ``data`` (label -> value) as a bar chart.

    The chart is written to ``<savepath>/<title>.html``.
    '''
    _ensureDir(savepath)
    bar = Bar(title, title_pos='center')
    attrs = list(data.keys())
    values = list(data.values())
    bar.add('', attrs, values, xaxis_rotate=15, yaxis_rotate=30)
    bar.render(os.path.join(savepath, '%s.html' % title))


def drawPie(title, data, savepath='./results'):
    '''
    Render ``data`` (label -> value) as a rose-type pie chart.

    The chart is written to ``<savepath>/<title>.html``.
    '''
    _ensureDir(savepath)
    pie = Pie(title, title_pos='center')
    attrs = list(data.keys())
    values = list(data.values())
    pie.add(
        '', attrs, values,
        is_label_show=True,
        legend_orient="vertical",  # legend entries stacked in a column
        legend_pos="left",         # legend on the left-hand side
        radius=[30, 75],
        rosetype="area",
    )
    pie.render(os.path.join(savepath, '%s.html' % title))


def drawFunnel(title, data, savepath='./results'):
    '''
    Render ``data`` (label -> value) as a funnel chart.

    The chart is written to ``<savepath>/<title>.html``.
    '''
    _ensureDir(savepath)
    funnel = Funnel(title, title_pos='center')
    attrs = list(data.keys())
    values = list(data.values())
    funnel.add(
        "", attrs, values,
        is_label_show=True,
        label_pos="inside",        # labels drawn inside the funnel
        label_text_color="#fff",
        funnel_gap=5,
        legend_pos="left",
        legend_orient="vertical",  # legend entries stacked in a column
    )
    funnel.render(os.path.join(savepath, '%s.html' % title))


def statistics(texts, stopwords):
    '''
    Count word frequencies over ``texts``.

    Args:
        texts: iterable of strings; each one is segmented with jieba.
        stopwords: iterable of words to ignore.

    Returns:
        dict mapping word -> number of occurrences. Stop words and the
        placeholder token 'unknow' are not counted.
    '''
    # Converted once to a set so that every membership test is O(1).
    stopwords = set(stopwords)
    words = Counter()
    for text in texts:
        for token in jieba.cut(text):
            if token in stopwords or token == 'unknow':
                continue
            words[token] += 1
    return dict(words)


def drawWordCloud(words, title, savepath='./results', font_path=None):
    '''
    Draw a word cloud from a word -> frequency mapping.

    Args:
        words: dict mapping word -> frequency.
        title: file name (without extension) of the image.
        savepath: output directory.
        font_path: optional path to a font file handed to WordCloud. The
            default (None) keeps WordCloud's own font.
            NOTE(review): the bundled default font presumably lacks CJK
            glyphs - pass a Chinese font here if the output shows boxes.
    '''
    _ensureDir(savepath)
    if not words:
        # generate_from_frequencies needs at least one word.
        print('[WARNING]: No words to draw, skip %s...' % title)
        return
    wc = WordCloud(
        font_path=font_path,
        background_color='white',
        max_words=2000,
        width=1920,
        height=1080,
        margin=5,
    )
    wc.generate_from_frequencies(words)
    wc.to_file(os.path.join(savepath, title + '.png'))


def _loadStopwords(path):
    '''Read one stop word per line from ``path``, ignoring blank lines.'''
    with open(path, 'r', encoding='utf-8') as f:
        return [line for line in f.read().splitlines() if line]


def main(datapath='python_61.pkl', stopwords_path='./stopwords.txt'):
    '''
    Run the complete analysis on the pickle stored at ``datapath``.

    Args:
        datapath: pickle written by the crawler script.
        stopwords_path: UTF-8 text file with one stop word per line.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # were produced by the crawler script itself.
    with open(datapath, 'rb') as f:
        data = pickle.load(f)

    # --- Price distribution ---
    prices = []
    price_max = ['', 0]
    for key, value in data.items():
        price = value[1]
        if price_max[1] < price:
            price_max = [key, price]
        prices.append(price)
    results = {}
    results['小于50元'] = sum(i < 50 for i in prices)
    results['50-100元'] = sum(50 <= i < 100 for i in prices)
    results['100-200元'] = sum(100 <= i < 200 for i in prices)
    results['200-300元'] = sum(200 <= i < 300 for i in prices)
    results['300-400元'] = sum(300 <= i < 400 for i in prices)
    results['400元以上'] = sum(i >= 400 for i in prices)
    drawPie('python相關圖書的價格分布', results)
    print('價格最高的圖書為: %s, 目前單價為: %f' % (price_max[0], price_max[1]))

    # --- Rating distribution ---
    # A rating of 0 is shown under a "no rating yet" label.
    stars = Counter(
        str(value[3]) if value[3] > 0 else '暫無評分'
        for value in data.values()
    )
    results = {each: stars[each] for each in sorted(stars)}
    drawBar('python相關圖書評分分布', results)

    # --- Number of comments ---
    comments_num = []
    top6 = {}
    for key, value in data.items():
        num = int(value[-1])
        comments_num.append(num)
        top6[_shortTitle(key)] = num
    results = {}
    results['0評論'] = sum(i == 0 for i in comments_num)
    results['0-100評論'] = sum(0 < i <= 100 for i in comments_num)
    results['100-1000評論'] = sum(100 < i <= 1000 for i in comments_num)
    results['1000-5000評論'] = sum(1000 < i <= 5000 for i in comments_num)
    results['5000評論以上'] = sum(i > 5000 for i in comments_num)
    drawFunnel('python相關圖書評論數量分布', results)
    # Keep the six entries with the most comments (ascending order).
    top6 = dict(sorted(top6.items(), key=lambda item: item[1])[-6:])
    drawBar('python相關圖書評論數量TOP6', top6)

    # --- Word cloud of the book introductions ---
    stopwords = _loadStopwords(stopwords_path)
    texts = [value[2] for value in data.values()]
    words_dict = statistics(texts, stopwords)
    drawWordCloud(words_dict, 'python相關圖書簡介詞云', savepath='./results')


if __name__ == '__main__':
    main()

圖片展示:




簡介詞云:

全部代碼與數據放在Github上:
https://github.com/why19970628/Python_Crawler/tree/master/DangDang_Books

總結

以上是生活随笔為你收集整理的当当网图书爬虫与数据分析的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。