【知识发现】基于物品的协同过滤推荐算法python实现
基于物品的協同過濾算法(Item-Based Collaborative Filtering)是目前業界應用最多的算法,亞馬遜、Netflix、Hulu、YouTube都采用該算法作為其基礎推薦算法。
基于用戶的協同過濾算法有一些缺點:隨著網站的用戶數目越來越大,計算用戶興趣相似度矩陣將越來越困難,其運算時間復雜度和空間復雜度的增長和用戶數的增長近似平方關係。并且,基于用戶的協同過濾算法很難對推薦結果做出解釋。因此亞馬遜提出了基于物品的協同過濾算法。
基于物品的協同過濾算法給用戶推薦那些和他們之前喜歡的物品相似的物品。不過ItemCF算法并不利用物品的內容屬性計算物品之間的相似度,它主要通過分析用戶的行為記錄計算物品之間的相似度,也就是說物品A和物品B具有很大的相似度是因為喜歡物品A的用戶大都也喜歡物品B(這一點也是基于物品的協同過濾算法和基于內容的推薦算法最主要的區別)。同時,基于物品的協同過濾算法可以利用用戶的歷史行為給推薦結果提供推薦解釋,用于解釋的物品都是用戶之前喜歡的或者購買的物品。
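ItemCF的公式可參考網上,核心是兩步:(1)物品相似度 $w_{ij} = \dfrac{|N(i) \cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}$,其中 $N(i)$ 為對物品 $i$ 有過行為的用戶集合;(2)對用戶 $u$ 有過行為的每個物品 $i$,取與 $i$ 最相似的 $K$ 個物品 $j$,累加得到用戶對物品 $j$ 的興趣 $p_{uj} = \sum_{i} w_{ij}\, r_{ui}$,其中 $r_{ui}$ 為用戶 $u$ 對物品 $i$ 的評分。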
1、數據集:https://grouplens.org/datasets/movielens/ ,下載 ratings.csv。
格式如下:

| userId | movieId | rating |
| --- | --- | --- |
| 1 | 31 | 2.5 |
| 1 | 1029 | 3 |
| 1 | 1061 | 3 |
| 1 | 1129 | 2 |
| 1 | 1172 | 4 |
| 1 | 1263 | 2 |
| 1 | 1287 | 2 |
| 1 | 1293 | 2 |
| 1 | 1339 | 3.5 |
| 1 | 1343 | 2 |
| 1 | 1371 | 2.5 |
2、參考代碼:

# -*- coding: utf-8 -*-
"""Item-based collaborative filtering (ItemCF) over ``user,item,rating`` CSV data.

Created on 2017-09-18
@author: Jason.F
"""
import csv
import math
import os
import random
from collections import defaultdict
from itertools import islice


class ItemBasedCF:
    """Item-based collaborative-filtering recommender with offline metrics.

    Attributes set by the methods below:
        datafile:  path of the ratings CSV file.
        data:      list of ``(user, item, rating)`` tuples read from the file.
        traindata: ``{user: {item: rating}}`` used to build the model.
        testdata:  ``{user: {item: rating}}`` held out for evaluation.
        itemSim:   ``{item: {other_item: similarity}}`` (after ItemSimilarity).
    """

    def __init__(self, datafile=None):
        """Load *datafile* and split it into train and test sets."""
        self.datafile = datafile
        self.readData()
        self.splitData()

    def readData(self, datafile=None):
        """Read ``user,item,rating`` rows from the CSV file into ``self.data``.

        The first line is treated as a header and skipped.  Only the first
        three columns are used, so files that carry extra columns (for
        example a trailing timestamp) load as well.  Blank lines are ignored.

        Raises:
            ValueError: if a non-empty row has fewer than three fields or the
                rating is not a number.
        """
        self.datafile = datafile or self.datafile
        self.data = []
        # ``with`` guarantees the handle is closed even if a row is malformed.
        with open(self.datafile, 'r', newline='') as handle:
            for row in islice(csv.reader(handle), 1, None):
                if not row:
                    continue
                userid, itemid, record = row[:3]
                self.data.append((userid, itemid, float(record)))

    def splitData(self, data=None, k=3, M=10, seed=10, full_train=False):
        """Split records into ``self.traindata`` and ``self.testdata``.

        A record goes to the test set when ``random.randint(0, M) == k``.
        ``randint`` is inclusive on both ends, so that is a 1-in-(M+1) chance.

        Args:
            data: iterable of ``(user, item, rating)``; defaults to ``self.data``.
            k: bucket number that selects test records.
            M: upper bound (inclusive) of the random bucket number.
            seed: seed for the ``random`` module, for reproducible splits.
            full_train: when True, test records are ALSO kept in the train
                set (the previous behaviour).  With that setting every test
                item is already "seen", and since ``recommend`` never returns
                seen items, recall and precision are always 0.
        """
        self.testdata = {}
        self.traindata = {}
        if data is None:
            data = self.data
        random.seed(seed)
        for user, item, record in data:
            held_out = random.randint(0, M) == k
            if held_out:
                self.testdata.setdefault(user, {})[item] = record
            if full_train or not held_out:
                self.traindata.setdefault(user, {})[item] = record

    def ItemSimilarity(self, train=None):
        """Build ``self.itemSim`` from co-occurrence counts.

        ``itemSim[i][j] = c_ij / sqrt(n_i * n_j)`` where ``c_ij`` is the number
        of users that have both items and ``n_i`` the number of users that
        have item ``i``.  Every item of *train* gets an entry, which is empty
        when the item never co-occurs with another one.
        """
        if train is None:
            train = self.traindata
        item_user_count = defaultdict(int)             # item -> number of users
        count = defaultdict(lambda: defaultdict(int))  # i -> j -> co-occurrences
        for items in train.values():
            for i in items:
                item_user_count[i] += 1
                for j in items:
                    if i != j:
                        count[i][j] += 1
        # Seed with every known item so lookups never miss isolated items.
        self.itemSim = {item: {} for item in item_user_count}
        for i, related_items in count.items():
            for j, cuv in related_items.items():
                self.itemSim[i][j] = cuv / math.sqrt(
                    item_user_count[i] * item_user_count[j] * 1.0)

    def recommend(self, user, train=None, k=10, nitem=5):
        """Return up to *nitem* recommendations for *user* as ``{item: score}``.

        For each item the user has in *train*, the *k* most similar items are
        taken; those the user does not already have are scored with
        ``rating * similarity`` and the scores are summed per candidate.
        Requires ``ItemSimilarity`` to have been called.  Unknown users and
        items without similarity data produce no candidates.
        """
        if train is None:
            train = self.traindata
        rank = {}
        ru = train.get(user, {})
        for i, pi in ru.items():
            neighbours = sorted(self.itemSim.get(i, {}).items(),
                                key=lambda x: x[1], reverse=True)[0:k]
            for j, wj in neighbours:
                if j in ru:
                    continue
                rank[j] = rank.get(j, 0) + pi * wj
        best = sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:nitem]
        return dict(best)

    def recallAndPrecision(self, train=None, test=None, k=8, nitem=5):
        """Return ``(recall, precision)`` of the recommendations over *test*.

        recall    = hits / number of test items
        precision = hits / (nitem * number of test users)
        Either value is 0.0 when its denominator is zero.
        """
        if train is None:
            train = self.traindata
        if test is None:
            test = self.testdata
        hit = 0
        relevant = 0
        recommended = 0
        for user, tu in test.items():
            rank = self.recommend(user, train=train, k=k, nitem=nitem)
            hit += sum(1 for item in rank if item in tu)
            relevant += len(tu)
            recommended += nitem
        recall = hit / (relevant * 1.0) if relevant else 0.0
        precision = hit / (recommended * 1.0) if recommended else 0.0
        return (recall, precision)

    def coverage(self, train=None, test=None, k=8, nitem=5):
        """Return the share of catalogue items recommended to the test users.

        The catalogue is the set of items in *train*.  Recommendations are
        drawn from those items, so the result stays within [0, 1].  Returns
        0.0 for an empty catalogue.
        """
        if train is None:
            train = self.traindata
        if test is None:
            test = self.testdata
        all_items = set()
        for items in train.values():
            all_items.update(items)
        recommend_items = set()
        for user in test:
            recommend_items.update(
                self.recommend(user, train, k=k, nitem=nitem))
        if not all_items:
            return 0.0
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, train=None, test=None, k=8, nitem=5):
        """Return the mean ``log(1 + popularity)`` of the recommended items.

        Popularity of an item is the number of users in *train* that have it.
        Returns 0.0 when nothing is recommended.
        """
        if train is None:
            train = self.traindata
        if test is None:
            test = self.testdata
        item_popularity = defaultdict(int)
        for items in train.values():
            for item in items:
                item_popularity[item] += 1
        ret = 0
        n = 0
        for user in test:
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item in rank:
                # .get avoids inserting keys and tolerates items absent from train.
                ret += math.log(1 + item_popularity.get(item, 0))
                n += 1
        return ret / (n * 1.0) if n else 0.0

    def testRecommend(self, user):
        """Print the top recommendations for *user*.

        Each line shows the item, its score and the user's rating for it in
        the train set (0 when the user has no rating for that item).
        """
        rank = self.recommend(user, k=10, nitem=5)
        items = self.traindata.get(user, {})
        for i, rvi in rank.items():
            record = items.get(i, 0)
            print("%5s: %.4f--%.4f" % (i, rvi, record))


if __name__ == "__main__":
    # Load ratings.csv from the current directory (portable path join).
    ibc = ItemBasedCF(os.path.join(os.getcwd(), 'ratings.csv'))
    ibc.ItemSimilarity()  # build the item-item similarity matrix
    ibc.testRecommend(user="345")  # recommendations for a single user
    print("%3s%20s%20s%20s%20s" % ('K', "recall", 'precision', 'coverage', 'popularity'))
    for k in [5, 10, 15, 20]:
        recall, precision = ibc.recallAndPrecision(k=k)
        coverage = ibc.coverage(k=k)
        popularity = ibc.popularity(k=k)
        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k, recall * 100, precision * 100, coverage * 100, popularity))

# Open issue noted in the original post: the pure-Python loops are too slow;
# the computation should be moved into pandas DataFrame operations.
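補充:在ItemSimilarity函數中,會出現物品只有一次行為的情況(該物品從未與其他物品被同一用戶共同評分),此時它不會出現在 itemSim 中,recommend 裡的 self.itemSim[i] 會拋出 KeyError,所以代碼需要更改。(原文此處的代碼缺失;一種可行的改法是在 recommend 中以 self.itemSim.get(i, {}) 取代 self.itemSim[i]。)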
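再補充:在recommend函數中,如果排除用戶已有行為的物品,則會出現準確率和召回率為零的情況,需要修正。(原文此處的代碼缺失;根本原因是 splitData 採用全量訓練,測試集中的物品全部同時出現在訓練集裡,因此被 recommend 全部排除。更穩妥的修正是在 splitData 中把測試記錄從訓練集中剔除,而不是取消 recommend 中的排除。)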
發現項亮的《推薦系統實踐》中代碼也是如此,可見是一個大的問題,沒有人發現和提出。
總結
以上是生活随笔為你收集整理的【知识发现】基于物品的协同过滤推荐算法python实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【正一专栏】梅西!梅西!梅西!
- 下一篇: 【正一专栏】恒大中超七连冠到手了