【Python-ML】电影评论数据集文本挖掘 -在线学习
生活随笔
收集整理的這篇文章主要介紹了
【Python-ML】电影评论数据集文本挖掘 -在线学习
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
# -*- coding: utf-8 -*-
'''
Created on 2018年1月22日
@author: Jason.F
@summary: 文本挖掘,對電影評論進行內容抽取、特征向量化并訓練模型預測,在線學習并持久化模型
電影評論數據:http://ai.stanford.edu/~amaas/data/sentiment/
'''
import pyprind
import pandas as pd
import os
import numpy as np
import re
import time
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Reference point for the total run time printed at the end of the script.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# documented replacement for measuring elapsed intervals.
start = time.perf_counter()
# Current working directory (not the directory of this file); the data file
# paths used below are built relative to it.
homedir = os.getcwd()
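# Import the raw reviews and write them to movie_data.csv
# (disabled: kept inside a bare string so it only runs when un-quoted)
'''
pbar=pyprind.ProgBar(50000)
labels={'pos':1,'neg':0}# labels for positive and negative reviews
df = pd.DataFrame()
# NOTE: DataFrame.append was removed in pandas 2.0; collect the rows in a
# list and build the DataFrame once if this block is re-enabled.
for s in ('test','train'):
    for l in ('pos','neg'):
        path=homedir+'/aclImdb/%s/%s' %(s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r') as infile:
                txt =infile.read()
            df =df.append([[txt,labels[l]]],ignore_index=True)
            pbar.update()
df.columns=['review','sentiment']
np.random.seed(0)
df=df.reindex(np.random.permutation(df.index))# shuffle the rows so positive and negative samples are mixed
df.to_csv(homedir+'/movie_data.csv',index=False)
'''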
# Vectorise the text, then train the model and update it incrementally.
# NOTE(review): df is not referenced again in the visible script -- the
# reviews are re-read line by line from the same CSV via stream_docs().
# Confirm whether loading the whole file here is still needed.
df=pd.read_csv(homedir+'/movie_data.csv')
stop = stopwords.words('english')# English stop-word list from NLTK
def tokenizer(text, stop_words=None):
    """Turn a raw review into a list of lowercase tokens without stop words.

    Processing steps:
      1. strip HTML tags (everything between ``<`` and ``>``),
      2. collect emoticons such as ``:)``, ``;-(``, ``=D`` from the text,
      3. lowercase the text and replace every run of non-word characters
         with a single space,
      4. append the emoticons (with the ``-`` "nose" removed) as separate
         tokens,
      5. drop every token that is a stop word.

    Args:
        text: the review text.
        stop_words: iterable of words to drop. Defaults to the module-level
            ``stop`` list, so the function can still be called with a single
            argument (which is how HashingVectorizer calls it).

    Returns:
        list of str: word tokens in order of appearance, followed by the
        emoticons found in the text.
    """
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1) per token; a list would be scanned each time.
    excluded = frozenset(stop_words)

    # Raw strings: '\W' and '\)' are invalid escapes in normal literals.
    without_tags = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', without_tags)
    words = re.sub(r'[\W]+', ' ', without_tags.lower())
    # The explicit ' ' separator keeps the first emoticon from being glued
    # onto the last word when the text ends in a word character
    # (previously 'it:) really' produced the token 'really:)').
    combined = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [token for token in combined.split() if token not in excluded]
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the review CSV, one per line.

    The first line of the file is treated as a header and skipped. Every
    other non-empty line is expected to end with ``,<digit>``: the final
    character is parsed as the integer label and everything before the
    preceding comma is yielded unchanged as the text (CSV quoting is NOT
    removed).

    Args:
        path: path of the CSV file to read (decoded as UTF-8).

    Yields:
        tuple: ``(text, label)`` with ``text`` a str and ``label`` an int.

    Raises:
        ValueError: if the last character of a line is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # The default stops an empty file from raising StopIteration inside
        # the generator, which Python 3.7+ turns into a RuntimeError.
        next(csv_file, None)  # skip header
        for line in csv_file:
            # Strip the line terminator first so the slices below also work
            # for a final line that has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            yield line[:-2], int(line[-1])
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: iterator yielding ``(text, label)`` pairs.
        size: maximum number of documents to fetch.

    Returns:
        tuple: ``(docs, labels)`` as two parallel lists. When the stream runs
        out part-way through, the documents collected so far are returned as
        a shorter batch. ``(None, None)`` is returned only when the stream
        was already exhausted and nothing could be read.
    """
    docs, labels = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            labels.append(label)
    except StopIteration:
        # Keep a partial final batch instead of throwing it away; signal
        # "no more data" only when nothing at all was read.
        if not docs:
            return None, None
    return docs, labels
# Hashing vectoriser: maps tokens straight to one of 2**21 feature columns,
# so no vocabulary has to be fitted before transforming mini-batches.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Logistic regression trained with stochastic gradient descent.
# * 'log_loss' is the current name of the logistic loss (it was 'log' before
#   scikit-learn 1.1; the old spelling was removed in 1.3).
# * The former n_iter=1 argument was removed in scikit-learn 0.21 and is not
#   needed: partial_fit always performs exactly one pass over its batch.
clf = SGDClassifier(loss='log_loss', random_state=1)
doc_stream = stream_docs(path=homedir+'/movie_data.csv')
pbar = pyprind.ProgBar(45)
# partial_fit must be told every class label up front because a single
# mini-batch is not guaranteed to contain both of them.
classes = np.array([0, 1])
# Out-of-core training: 45 mini-batches of 1000 reviews each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break  # stream exhausted early
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)  # incremental update
    pbar.update()

# Evaluation on the next 5000 reviews of the stream.
X_test, y_test = get_minibatch(doc_stream, size=5000)
if not X_test:
    # Fail with a clear message instead of a TypeError inside transform().
    raise RuntimeError('no documents left in movie_data.csv for the test batch')
X_test = vect.transform(X_test)
print('Accuracy:%.3f' % clf.score(X_test, y_test))
clf = clf.partial_fit(X_test, y_test)  # fold the test batch into the model

# Persist the stop words and the trained model.
dest = os.path.join('pkl_objects')
os.makedirs(dest, exist_ok=True)  # no exists()/makedirs() race
# 'with' closes the files, so the pickles are flushed to disk before the
# classifier is read back below.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as out:
    pickle.dump(stop, out, protocol=2)  # save the stop words
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as out:
    pickle.dump(clf, out, protocol=2)  # save the model

# Reload the model and classify a sample sentence.
# NOTE: pickle.load can execute arbitrary code; this is acceptable here only
# because the file was written by this script a few lines above.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as src:
    clf = pickle.load(src)
label = {0: 'negative', 1: 'positive'}
example = ['I love this movie']
X = vect.transform(example)
print('Prediction:%s \nProbability:%.2f%%' % (label[clf.predict(X)[0]],
                                               np.max(clf.predict_proba(X))*100))
# time.clock() was removed in Python 3.8; perf_counter() replaces it.
end = time.perf_counter()
print('finish all in %s' % str(end - start))
結果:
Warning: No valid output stream.
Accuracy:0.867
Prediction:positive
Probability:82.53%
finish all in 50.6331459967

總結
以上是生活随笔為你收集整理的【Python-ML】电影评论数据集文本挖掘 -在线学习的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【Python-ML】电影评论数据集文本
- 下一篇: 【Python-ML】探索式数据分析ED