七城市房价统计
在鏈家網上爬取北京、上海、廣州、深圳、天津、青島、杭州的房價,繪制熱力圖、箱線圖。
首先爬取房價信息。
import re
from urllib import request

from bs4 import BeautifulSoup as bs
import pandas as pd

# Scrape listing prices for one city from lianjia.com, page by page, and save
# them to "<REGION>HousePrice.csv" as a single "price" column.
region = "hz"  # city abbreviation, used as the lianjia subdomain
pages = 73  # number of listing pages lianjia has for this city

# Compiled once, outside the page loop.
# Some homes are priced per unit rather than per square metre; the first
# pattern keeps only the price snippets sold per square metre.
per_sqm_pattern = re.compile(r'.+\s.+元')
# Extracts the 5-6 digit price figures from the kept snippets.
price_pattern = re.compile(r'\d{5,6}')

# Collect every price first and build the DataFrame once at the end, instead
# of concatenating a new DataFrame on every page.
all_prices = []
for i in range(1, pages + 1):
    url = f"https://{region}.fang.lianjia.com/loupan/pg{i}/"
    # The context manager closes the HTTP response once the page is read.
    with request.urlopen(url) as res:
        html_data = res.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    print(i, "/", pages)  # progress indicator

    # Find the price tags; note the trailing underscore in `class_`.
    price_tags = soup.find_all('div', class_="main-price")
    price_str = str(price_tags)
    per_sqm_text = "".join(per_sqm_pattern.findall(price_str))  # list -> str
    all_prices.extend(price_pattern.findall(per_sqm_text))

df = pd.DataFrame({"price": all_prices})

region = region.upper()
file_name = region + "HousePrice.csv"
df.to_csv(file_name)

# The heat map is drawn with the folium package; install folium with pip.
pip3 install folium
導入需要的包
import pandas as pd
import numpy as np
import folium
import webbrowser
from folium.plugins import HeatMap
import matplotlib.pyplot as plt

# Tidy the scraped price data, compute the per-city mean, and draw a box plot.
# NOTE(review): assumes `df` holds one price column per city, in the order of
# the name lists below; the step that merges the per-city CSV files into that
# frame is not shown here - confirm.
english_names = [
    "Beijing",
    "Shanghai",
    "Guangzhou",
    "Shenzhen",
    "Tianjin",
    "Qingdao",
    "Hangzhou",
]
chinese_names = ["北京市", "上海市", "廣州市", "深圳市", "天津市", "青島市", "杭州市"]

# Box plot with English column labels.
df.columns = english_names
df.boxplot()
plt.show()

# Switch to the Chinese city names; they become the index of `mean_price`.
# NOTE(review): these labels are later used to look cities up in Cities.csv,
# so they presumably must match that file's spelling exactly - verify.
df.columns = chinese_names
# Mean of every column, stored as a one-column DataFrame named "price".
mean_price = df.mean().to_frame("price")
下載中國城市的經緯度信息(點擊打開鏈接),保存為 Cities.csv。
# Load the city coordinate table, keep only the name / longitude / latitude
# columns, and index it by city name so rows can be looked up by name later.
cities = (
    pd.read_csv("Cities.csv", sep=",")[["cities", "lon", "lat"]]
    .set_index("cities")
)

# Next: put the city coordinates and the mean prices in one DataFrame.
# Build one DataFrame holding each city's coordinates and mean price, indexed
# by the city names found in `mean_price`.
#
# The values are stored latitude first, then longitude, so the column labels
# are ["lat", "lon"] to match. They were previously labelled ["lon", "lat"]
# while being filled in [lat, lon] order, which wrote swapped headers to
# mean_price.csv. The positional order (lat, lon, price) is unchanged.
# A city missing from `cities` raises KeyError.
my_cities = pd.DataFrame(
    [
        [cities.at[name, "lat"], cities.at[name, "lon"]]
        for name in mean_price.index
    ],
    index=mean_price.index,
    columns=["lat", "lon"],
)

# Append the mean price as a third column.
my_cities = pd.concat([my_cities, mean_price], axis=1)

# encoding="utf_8_sig" avoids garbled Chinese text in the CSV (original
# author's note).
my_cities.to_csv("mean_price.csv", encoding="utf_8_sig")

# Next: draw the heat map.
from pathlib import Path

# Convert the DataFrame to a plain list of rows for the heat map: first to an
# ndarray, then tolist(). Each row is [latitude, longitude, mean price], the
# order in which the values were stored earlier in this script.
lst = np.array(my_cities).tolist()

# Base map centred on [35, 110] with an initial zoom level of 5.
map_osm = folium.Map(location=[35, 110], zoom_start=5)
HeatMap(lst).add_to(map_osm)  # add the heat map layer to the map

file_path = r"house_price.html"
map_osm.save(file_path)  # save as an HTML file

# Open the result in the default browser. webbrowser.open() expects a URL;
# passing a bare relative filename is documented as neither supported nor
# portable, so hand it an absolute file:// URI instead.
webbrowser.open(Path(file_path).resolve().as_uri())

# Summary