生活随笔
收集整理的這篇文章主要介紹了
Python电影推荐系统
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
Python實現基於皮爾森相關係數的協同過濾電影推薦。
爬蟲獲取用戶數據
"""
Crawl reviewer data from Douban for the movie recommender.

Collects the users listed on the hot-comment pages of one Douban film, then
downloads each user's rated-film pages and saves everything to
movie_data.json.

NOTE(review): the original description spoke of the first 100 reviewers of
"The Shawshank Redemption" saved as movie.json, while the code requests
10 pages with limit=20 for subject 27010768 and writes movie_data.json --
confirm which is intended.
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import json
import urllib
import requests

people_names = []
people_urls = []
# Greedy capture of whatever sits between the first "e/" and the last "/" of
# a profile URL; presumably the id in ".../people/<id>/" -- TODO confirm.
r = re.compile(r'e/(.+)/')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3724.8 Safari/537.36',
    'Referer': 'https://movie.douban.com/subject/26100958/comments',
    'Connection': 'keep-alive',
}


def _fetch_soup(url):
    """Download *url* with the crawler headers and return the parsed HTML."""
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


def _parse_rated_movies(bs):
    """Yield (title, rate, comment) for every rated film on a collection page."""
    grid = bs.find("div", {"class": "grid-view"})
    if grid is None:
        # No film grid on this page: there is nothing to read, and calling
        # findAll on None would abort the whole crawl.
        return
    for info in grid.findAll("div", {"class": "info"}):
        movie_name = info.em.get_text()
        try:
            # The rate is the first digit found in the CSS class of the span
            # inside the third <li>.
            movie_rate = re.search(
                "[0-9]", info.findAll("li")[2].span.attrs["class"][0]
            ).group()
        except (AttributeError, IndexError, KeyError, TypeError):
            # Entry without a readable rate: skip it (best effort).
            continue
        comment_tag = info.find("span", {"class": "comment"})
        movie_comment = comment_tag.get_text() if comment_tag is not None else ""
        yield movie_name, movie_rate, movie_comment


print("爬取用戶中 ...")
for page in range(0, 10):
    url = ("https://movie.douban.com/subject/27010768/comments?"
           "start=" + str(page * 20)
           + "&limit=20&sort=new_score&status=P&percent_type=")
    bs = _fetch_soup(url)
    for comment in bs.findAll("div", {"class": "comment"}):
        # The second link of a comment is used as the profile link; it is
        # rewritten from the "www" host to the "movie" host.
        people_url = comment.findAll("a")[1].attrs["href"].replace("www", "movie")
        name = re.findall(r, people_url)[0]
        people_names.append(name)
        people_urls.append(people_url)
print("爬取用戶完成")

final_data = {}
for name, people_url in zip(people_names, people_urls):
    record = final_data.setdefault(name, {})
    record["people_url"] = people_url
    # Always create "movies" so that users without any readable rating still
    # have the key when the data is consumed later.
    record.setdefault("movies", {})

print("爬取用戶影評中...")
for user_count, people_name in enumerate(final_data, start=1):
    print("正在爬取第" + str(user_count) + "位用戶" + people_name + "的影評信息")
    movies = final_data[people_name]["movies"]
    for page in range(0, 6):
        comment_url_suffix = ("collect?start=" + str(page * 15)
                              + "&sort=time&rating=all"
                              "&filter=all&mode=grid")
        comment_url = final_data[people_name]["people_url"] + comment_url_suffix
        bs = _fetch_soup(comment_url)
        for movie_name, movie_rate, movie_comment in _parse_rated_movies(bs):
            movie = movies.setdefault(movie_name, {})
            movie["movie_rate"] = movie_rate
            movie["movie_comment"] = movie_comment
print("爬取用戶影評完成")

with open('movie_data.json', 'w', encoding='utf-8') as file:
    json.dump(final_data, file, ensure_ascii=False)

# Read the file back, as the original script did; a failure here means the
# JSON that was just written cannot be parsed.
with open('movie_data.json', 'r', encoding='utf-8') as file:
    s = json.load(file)
爬蟲獲取待推薦用戶數據(默認自己):
"""
Crawl my own Douban ratings and add them to the shared data file.

Loads movie_data.json, downloads the rated-film pages of the user to
recommend for (by default myself) and writes the merged data back to
movie_data.json, so that users with a similar taste can be found later.
"""
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import urllib

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3724.8 Safari/537.36',
    'Referer': 'https://movie.douban.com/subject/26100958/comments',
    'Connection': 'keep-alive',
}

with open('movie_data.json', 'r', encoding='utf-8') as file:
    movie_data = json.load(file)

# Douban id of the user to build recommendations for.
people_name = "204331023"
url = "https://movie.douban.com/people/" + people_name + "/"
record = movie_data.setdefault(people_name, {})
record["people_url"] = url
# Always create "movies" so the key exists even when no rating can be read.
movies = record.setdefault("movies", {})

for page in range(0, 6):
    comment_url_suffix = ("collect?start=" + str(page * 15)
                          + "&sort=time&rating=all"
                          "&filter=all&mode=grid")
    comment_url = movie_data[people_name]["people_url"] + comment_url_suffix
    req = urllib.request.Request(url=comment_url, headers=headers)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req) as response:
        comment_data = response.read().decode('utf-8')
    bs = BeautifulSoup(comment_data, 'html.parser')

    grid = bs.find("div", {"class": "grid-view"})
    if grid is None:
        # No film grid on this page: nothing to read, and calling findAll on
        # None would abort the script before the data is saved.
        continue
    for info in grid.findAll("div", {"class": "info"}):
        movie_name = info.em.get_text()
        try:
            # The rate is the first digit found in the CSS class of the span
            # inside the third <li>.
            movie_rate = re.search(
                "[0-9]", info.findAll("li")[2].span.attrs["class"][0]
            ).group()
        except (AttributeError, IndexError, KeyError, TypeError):
            # Entry without a readable rate: skip it (best effort).
            continue
        comment_tag = info.find("span", {"class": "comment"})
        movie_comment = comment_tag.get_text() if comment_tag is not None else ""
        movie = movies.setdefault(movie_name, {})
        movie["movie_rate"] = movie_rate
        movie["movie_comment"] = movie_comment

with open('movie_data.json', 'w', encoding='utf-8') as file:
    json.dump(movie_data, file, ensure_ascii=False)
實現電影推薦
"""
Recommend films with user-based collaborative filtering.

Uses the Pearson coefficient to find users similar to me, then ranks the
films those users rated.

NOTE(review): the original description promised 20 recommendations, while
the entry point of this script requests n=5 -- confirm which is intended.
"""
import json
from math import sqrt

# The crawler scripts read and write 'movie_data.json'; the original code
# opened 'movie_data1.json', a file no script in this article creates.
with open('movie_data.json', 'r', encoding='utf-8') as file:
    movie_data = json.load(file)

# Douban id of the user to build recommendations for.
my_name = "204331023"
def sim_pearson(data, p1, p2):
    """Return the Pearson coefficient of two users' ratings.

    Only films present in both users' "movies" mappings take part in the
    computation.

    :param data: mapping of user id -> user record; a record's "movies"
        entry maps a film title to a dict whose "movie_rate" value can be
        converted with int()
    :param p1: id of the first user
    :param p2: id of the second user
    :return: the coefficient, or 0 when the users share no film or when the
        denominator is 0 (one user's shared ratings are all equal)
    """
    # A record without "movies" is treated as "no ratings" instead of
    # raising KeyError.
    movies1 = data[p1].get("movies", {})
    movies2 = data[p2].get("movies", {})

    shared = [title for title in movies1 if title in movies2]
    if not shared:
        return 0
    n = len(shared)

    # Convert every rate once instead of once per sum.
    rates1 = [int(movies1[title]["movie_rate"]) for title in shared]
    rates2 = [int(movies2[title]["movie_rate"]) for title in shared]

    sum1 = sum(rates1)
    sum2 = sum(rates2)
    sum1_sq = sum(rate * rate for rate in rates1)
    sum2_sq = sum(rate * rate for rate in rates2)
    p_sum = sum(a * b for a, b in zip(rates1, rates2))

    num = p_sum - (sum1 * sum2 / n)
    den = sqrt((sum1_sq - pow(sum1, 2) / n) * (sum2_sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    return num / den
def top_matches(data, person, similarity=sim_pearson, min_sim=0.5):
    """Return the records of *person* and of every sufficiently similar user.

    Users are selected by a similarity threshold, not by a fixed count.

    :param data: mapping of user id -> user record
    :param person: id of the user to compare the others with
    :param similarity: callable ``similarity(data, person, other)`` that
        returns a number
    :param min_sim: lowest similarity at which another user is kept
    :return: dict holding ``data[person]`` plus the record of every other
        user whose similarity is at least *min_sim*
    """
    sorted_data = {person: data[person]}
    for other, record in data.items():
        if other == person:
            continue
        if similarity(data, person, other) >= min_sim:
            sorted_data[other] = record
            # Progress output kept from the original implementation.
            print(other, sorted_data[other])
    return sorted_data
def get_recommendations(data1, person, n=5, similarity=sim_pearson):
    """Return the best-scored films that *person* has not rated yet.

    Each candidate film is scored with the similarity-weighted average of
    the rates given by the users that ``top_matches`` selects.

    :param data1: mapping of user id -> user record (film rating data)
    :param person: id of the user to recommend for
    :param n: maximum number of recommendations to return
    :param similarity: callable ``similarity(data, person, other)`` that
        returns a number
    :return: list of ``(score, title)`` tuples, highest score first
    """
    totals = {}
    sim_sum = {}
    # Forward the caller's similarity so neighbour selection and weighting
    # use the same measure (the original always filtered with the default).
    data = top_matches(data1, person, similarity=similarity)
    own_movies = data[person].get("movies", {})

    for other in data:
        if other == person:
            continue
        sim = similarity(data, person, other)
        # Progress output kept from the original implementation.
        print(sim)
        if sim <= 0:
            continue
        for item, entry in data[other].get("movies", {}).items():
            mine = own_movies.get(item)
            # A film counts as unseen when I have no entry for it or my rate
            # is 0. The original compared the whole entry dict with 0, which
            # is never true.
            if mine is not None and int(mine["movie_rate"]) != 0:
                continue
            totals[item] = totals.get(item, 0) + int(entry["movie_rate"]) * sim
            sim_sum[item] = sim_sum.get(item, 0) + sim

    rankings = sorted(
        ((total / sim_sum[item], item) for item, total in totals.items()),
        reverse=True,
    )
    return rankings[0:n]


if __name__ == '__main__':
    for res in get_recommendations(movie_data, my_name, n=5):
        print(res)
使用過程中,按文中順序依次運行三個文件即可(先爬取用戶數據,再爬取待推薦用戶數據,最後運行電影推薦)。
運行結果:
參考: https://blog.csdn.net/XYYxyy55/article/details/80487007
總結
以上是生活随笔為你收集整理的Python电影推荐系统的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。