电影评论情感分析-IMDB数据集 | python 利用朴素贝叶斯、神经网络模型
載入包
import torch  # torch==1.7.1
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm

# The tutorial trains on the second GPU ('cuda:1').  Only select it when a
# second GPU actually exists; otherwise fall back to the first GPU, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

MAX_WORD = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 300  # every review is padded / truncated to 300 tokens
word_count = {}  # token -> number of occurrences

# Data processing
#讀取數據集
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the reviews are kept as ordinary text.
data = pd.read_csv('./data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
print('dataset shape is', data.shape)

# Data cleaning
# Strip the HTML markup from the first review
from bs4 import BeautifulSoup

# Pass the 'html' feature explicitly, the same way review_to_text does, so the
# parser choice is not guessed.
example = BeautifulSoup(data['review'][0], 'html')
print(example.get_text())

# Replace every non-letter character with a space
import re

letters_only = re.sub('[^A-Za-z]', ' ', example.get_text())
print(letters_only)

# Lower-case the text and split it into words
lower_case = letters_only.lower()
words = lower_case.split()
print(words)

# Load the stop words
# import nltk
# nltk.download('stopwords')
def get_custom_stopwords(stop_words_file):
    """Read a stop-word list from a UTF-8 text file, one word per line.

    Args:
        stop_words_file: path of the stop-word file.

    Returns:
        list of stop words in file order; surrounding whitespace is stripped
        and blank lines are skipped.
    """
    with open(stop_words_file, encoding='utf-8') as f:
        return [line.strip() for line in f.read().splitlines() if line.strip()]


stop_words_file = 'english.txt'
stopwords = get_custom_stopwords(stop_words_file)

# Build the set once so each membership test is O(1) instead of scanning a list.
stopword_set = set(stopwords)
words = [word for word in words if word not in stopword_set]
# A bare expression shows nothing outside a notebook, so print the result.
print(' '.join(words))

# Wrap the cleaning steps in a function
from bs4 import BeautifulSoup
# import re
# from nltk.corpus import stopwords


def review_to_text(review):
    """Turn one raw review into a list of lower-case words.

    Steps:
        1. remove HTML markup with BeautifulSoup;
        2. replace every character that is not an ASCII letter with a space;
        3. lower-case the text and split it on whitespace.

    Args:
        review: raw review text, possibly containing HTML.

    Returns:
        list of lower-case word strings.
    """
    # Step 1: strip HTML tags.
    raw_text = BeautifulSoup(review, 'html').get_text()
    # Step 2: re.sub(pattern, replacement, string) - non-letters become spaces.
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Step 3: lower-case first, then split on whitespace into a word list.
    words = letters.lower().split()
    return words


# Apply the cleaning steps to every review in the dataset
# One cleaned, space-joined string per review, and the matching labels.
X_data = [' '.join(review_to_text(review)) for review in data['review']]
y_data = list(data['sentiment'])

# Split the dataset
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=45)

# Classical machine-learning baseline: naive Bayes
#向量表示,和對數據進行學習,利用樸素貝葉斯分類器
from sklearn.feature_extraction.text import CountVectorizer

# max_features=5000: the 5000 most frequent words become the feature columns.
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                             stop_words=None, max_features=5000)
train_data_features = vectorizer.fit_transform(X_train)
t_data_features = vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_data_features, y_train)
print(nb.score(train_data_features, y_train))
print(nb.score(t_data_features, y_test))

# Prediction example (disabled in the article):
# pre_str="Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty"
# pre_str_list=[(' '.join(review_to_text(pre_str)))]
# pre_data = vectorizer.transform(pd.Series(pre_str_list))
# result = nb.predict(pre_data)
# print(result)
#
# The article states a training accuracy of 0.86 and a test accuracy of 0.84.
# output: 0.86145 0.8498 [1]
利用神經模型LSTM/GRU進行數據學習、分類、預測
**
#將處理好的訓練數據和測試數據寫入新的train.txt和test.txt,便于使用dataset讀取數據
# Each output line is "<label> <cleaned review text>\n".
with open("train.txt", "w", encoding="utf-8") as f:
    for label, text in zip(y_train, X_train):
        f.write(str(label) + " " + text + "\n")

with open("test.txt", "w", encoding="utf-8") as f1:
    for label, text in zip(y_test, X_test):
        f1.write(str(label) + " " + text + "\n")

# Split sentences into words, count word frequencies and build the vocabulary
def tokenizer(sentence):
    """Split *sentence* on whitespace and return the list of tokens."""
    return sentence.split()


def data_process(text, max_word=None):
    """Count token frequencies in *text* and build a token -> id vocabulary.

    Args:
        text: iterable of sentences (strings).
        max_word: how many of the most frequent words are kept.  Defaults to
            the module-level constant ``MAX_WORD``.

    Returns:
        dict mapping token to integer id.  ``"<UNK>"`` is 0, ``"<PAD>"`` is 1
        and the kept words follow in descending frequency order.

    Side effects:
        Adds the counts to the module-level ``word_count`` dict, so counts
        accumulate when the function is called more than once.  Prints
        "build vocabulary".
    """
    if max_word is None:
        max_word = MAX_WORD

    for line in text:
        for token in tokenizer(line):
            # The first occurrence counts as 1 (it used to be stored as 0).
            word_count[token] = word_count.get(token, 0) + 1

    print("build vocabulary")
    vocab = {"<UNK>": 0, "<PAD>": 1}
    # sorted() is stable, so equally frequent words keep first-seen order.
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    kept = 0
    for word, _ in ranked:
        if kept >= max_word:
            break
        if word not in vocab:
            vocab[word] = len(vocab)
        kept += 1
    return vocab


# Build the vocabulary
vocab = data_process(X_train)
# print(vocab)

# Recurrent model definition (the encoder can be an nn.LSTM or an nn.GRU)
class GRU(nn.Module):
    """Recurrent sentiment classifier: embedding -> RNN encoder -> linear -> softmax.

    Despite the class name, the default encoder is an ``nn.LSTM``, exactly as
    in the original code.  Pass ``rnn_type="gru"`` to use an ``nn.GRU``.

    Args:
        vocab: token -> id mapping; only ``len(vocab)`` is used, as the
            embedding table size.
        embed_size: embedding dimension.
        num_hiddens: hidden size of the recurrent encoder.
        num_layers: number of stacked recurrent layers.
        rnn_type: ``"lstm"`` (default) or ``"gru"``, case-insensitive.

    Raises:
        ValueError: if ``rnn_type`` is not one of the supported names.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers, rnn_type="lstm"):
        super(GRU, self).__init__()
        rnn_classes = {"lstm": nn.LSTM, "gru": nn.GRU}
        try:
            rnn_cls = rnn_classes[rnn_type.lower()]
        except KeyError:
            raise ValueError(
                "rnn_type must be 'lstm' or 'gru', got %r" % (rnn_type,)) from None
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer
        self.encoder = rnn_cls(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        """Return class probabilities of shape (batch, 2).

        ``inputs`` holds token ids with shape (batch, seq_len).
        """
        # The encoder is not batch_first, so move seq_len to the first axis
        # before the embedding lookup.
        embeddings = self.embedding(inputs.permute(1, 0))
        # outputs: (seq_len, batch, num_hiddens) - top-layer state at every
        # time step.  The second return value (final states) is not needed.
        outputs, _ = self.encoder(embeddings)
        # Use the top-layer state at the last time step as the encoding.
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # probabilities [a, b]
        return outs


# Convert text to index tensors
def text_transform(sentence_list, vocab):
    """Convert sentences to a LongTensor of token ids of width ``MAX_LEN``.

    Args:
        sentence_list: iterable of sentences (strings).
        vocab: token -> id mapping containing ``'<UNK>'`` and ``'<PAD>'``.

    Returns:
        ``torch.LongTensor`` with one row per sentence.  Unknown tokens map
        to ``vocab['<UNK>']``; rows are truncated to ``MAX_LEN`` tokens and
        right-padded with ``vocab['<PAD>']``.
    """
    unk_id = vocab['<UNK>']
    pad_id = vocab['<PAD>']
    sentence_index_list = []
    for sentence in sentence_list:
        # Tokens -> ids, keeping at most the first MAX_LEN tokens.
        sentence_idx = [vocab.get(token, unk_id) for token in tokenizer(sentence)][:MAX_LEN]
        # Right-pad short sentences (a non-positive count adds nothing).
        sentence_idx.extend([pad_id] * (MAX_LEN - len(sentence_idx)))
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)


# Model training
def train(model, train_data, vocab, epoch=10):
    """Train *model* and save its parameters to 'LSTM_IMDB_parameter.pkl'.

    Args:
        model: network whose forward pass returns class probabilities.
        train_data: iterable of ``(text, label)`` batches (a DataLoader).
        vocab: token -> id mapping passed to ``text_transform``.
        epoch: number of passes over ``train_data``.

    Prints the mean batch loss and mean batch accuracy after every epoch.
    """
    print('train model')
    model = model.to(device)
    # Loss function and optimizer.
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    # Use a throwaway loop variable so the `epoch` argument is not shadowed.
    for _ in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # running loss for this epoch
        avg_acc = 0  # running accuracy for this epoch
        for text, label in tqdm(train_data):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            # NLLLoss expects log-probabilities; clamp first so a probability
            # that underflowed to 0 cannot yield log(0) = -inf.
            loss = criterion(pred.clamp(min=1e-12).log(), train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # End of epoch: average over the number of batches.
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)
        print("avg_loss:", avg_loss, " train_avg_acc:,", avg_acc)
    # Save the parameters once training has finished.
    torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')


# Dataset format
class MyDataset(Dataset):
    """Dataset over a text file whose lines are "<label> <text>".

    Args:
        text_path: path of the UTF-8 text file to read.
    """

    def __init__(self, text_path):
        # The context manager closes the file even if reading fails.
        with open(text_path, 'r', encoding='utf-8') as file:
            self.text_with_tag = file.readlines()  # raw "<label> <text>" lines

    def __getitem__(self, index):
        """Return ``(text, label)`` for the sample at *index*."""
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose its last character.
        line = self.text_with_tag[index].rstrip('\n')
        # Split at the first space: labels with several digits stay intact.
        label, _, text = line.partition(' ')
        return text, int(label)

    def __len__(self):
        """Return the number of lines read from the file."""
        return len(self.text_with_tag)


# Model evaluation
def tst(model, test_data, vocab):
    """Evaluate *model* and return the mean of the per-batch accuracies.

    Args:
        model: network whose forward pass returns class probabilities.
        test_data: iterable of ``(text, label)`` batches (a DataLoader).
        vocab: token -> id mapping passed to ``text_transform``.

    Returns:
        Sum of the batch accuracies divided by ``len(test_data)``.
    """
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    # No gradients are needed for evaluation; disabling them avoids building
    # autograd graphs and saves memory.
    with torch.no_grad():
        for text, label in tqdm(test_data):
            test_x = text_transform(text, vocab).to(device)
            test_y = label.to(device)
            pred = model(test_x)
            avg_acc += accuracy(pred, test_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc


# Prediction accuracy
def accuracy(y_pred, y_true):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        y_pred: tensor of per-class scores, one row per sample.
        y_true: tensor of integer class labels, one per sample.

    Returns:
        Number of correct predictions divided by ``len(y_pred)``, as a NumPy
        scalar.
    """
    label_pred = y_pred.max(dim=1)[1]  # index of the highest score per row
    # Count exact matches.  Subtracting |pred - true| from the batch size is
    # only correct when the labels are 0/1.
    correct = torch.sum(label_pred == y_true)
    return correct.detach().cpu().numpy() / len(y_pred)


# main function
def main():
    """Build the vocabulary, train the model, reload it and print test accuracy."""
    vocab = data_process(X_train)
    np.save('vocab.npy', vocab)  # store the vocabulary on disk
    # allow_pickle is required because the array wraps a dict; only load
    # vocab.npy files produced by this script.
    vocab = np.load('vocab.npy', allow_pickle=True).item()

    # Dataset instances over the files written earlier.
    train_data = MyDataset(text_path="./train.txt")
    test_data = MyDataset(text_path="./test.txt")

    # Data loaders.
    train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Model.
    model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)
    train(model=model, train_data=train_loader, vocab=vocab, epoch=30)

    # Reload the saved parameters onto the module-level device instead of
    # repeating the device-selection expression.
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl', map_location=device))

    # Test result.
    acc = tst(model=model, test_data=test_loader, vocab=vocab)
    print(acc)


# Run
if __name__ == '__main__':
    main()

# Results
參考:
情感分析-IMDB數據集
pytorch構建LSTM分類器用于IMDB情感分類
數據集:
文中涉及到的數據集和停用詞表
鏈接:https://pan.baidu.com/s/1OTgLDoE1P9_FPDQaLU1VKw
提取碼:mz6p
全部源碼
import os
import re
import warnings

import numpy as np
import pandas as pd
import torch  # torch==1.7.1
import torch.nn as nn
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm

# NOTE(review): gdata is only referenced by the commented-out ArrayDataset
# experiment in main(); kept because the original script imports it.
from mxnet.gluon import data as gdata

warnings.filterwarnings('ignore')

# The tutorial trains on the second GPU ('cuda:1').  Only select it when a
# second GPU actually exists; otherwise fall back to the first GPU, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

MAX_WORD = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 300  # every review is padded / truncated to 300 tokens
word_count = {}  # token -> number of occurrences

# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews stay as text.
data = pd.read_csv('./data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
print('dataset shape is', data.shape)

# Walk through the cleaning steps on the first review.
example = BeautifulSoup(data['review'][0], 'html')
print(example.get_text())

letters_only = re.sub('[^A-Za-z]', ' ', example.get_text())
print(letters_only)

lower_case = letters_only.lower()
words = lower_case.split()
print(words)


# import nltk
# nltk.download('stopwords')
def get_custom_stopwords(stop_words_file):
    """Read a stop-word list from a UTF-8 text file, one word per line.

    Returns the words in file order; surrounding whitespace is stripped and
    blank lines are skipped.
    """
    with open(stop_words_file, encoding='utf-8') as f:
        return [line.strip() for line in f.read().splitlines() if line.strip()]


stop_words_file = 'english.txt'
stopwords = get_custom_stopwords(stop_words_file)
# Build the set once so each membership test is O(1).
stopword_set = set(stopwords)
words = [word for word in words if word not in stopword_set]
# A bare expression shows nothing outside a notebook, so print the result.
print(' '.join(words))


def review_to_text(review):
    """Turn one raw review into a list of lower-case words.

    Removes HTML markup, replaces every character that is not an ASCII letter
    with a space, lower-cases the text and splits it on whitespace.
    """
    # Step 1: strip HTML tags.
    raw_text = BeautifulSoup(review, 'html').get_text()
    # Step 2: non-letters become spaces.
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Step 3: lower-case, then split on whitespace.
    words = letters.lower().split()
    # Stop-word filtering is disabled in the original:
    # words = [w for w in words if w not in stopwords]
    # words = [w for w in words if w not in stopwords.words()]
    return words


# Clean every review in the dataset.
X_data = [' '.join(review_to_text(review)) for review in data['review']]
y_data = list(data['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=45)
print(type(y_train))

# Each output line is "<label> <cleaned review text>\n".
with open("train.txt", "w", encoding="utf-8") as f:
    for label, text in zip(y_train, X_train):
        f.write(str(label) + " " + text + "\n")

with open("test.txt", "w", encoding="utf-8") as f1:
    for label, text in zip(y_test, X_test):
        f1.write(str(label) + " " + text + "\n")

# Naive Bayes baseline (disabled in the full listing):
# from sklearn.feature_extraction.text import CountVectorizer
# # max_features=5000: the 5000 most frequent words become the feature columns
# vectorizer = CountVectorizer(analyzer = 'word', tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
# train_data_features = vectorizer.fit_transform(X_train)
# t_data_features = vectorizer.transform(X_test)
#
# from sklearn.naive_bayes import MultinomialNB
# nb = MultinomialNB()
# nb.fit(train_data_features,y_train)
# print(nb.score(train_data_features, y_train))
# print(nb.score(t_data_features, y_test))
#
# # Prediction
# pre_str="Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty"
# pre_str_list=[(' '.join(review_to_text(pre_str)))]
# pre_data = vectorizer.transform(pd.Series(pre_str_list))
# result = nb.predict(pre_data)
# print(result)


def tokenizer(sentence):
    """Split *sentence* on whitespace and return the list of tokens."""
    return sentence.split()


def data_process(text, max_word=None):
    """Count token frequencies in *text* and build a token -> id vocabulary.

    Args:
        text: iterable of sentences (strings).
        max_word: how many of the most frequent words are kept.  Defaults to
            the module-level constant ``MAX_WORD``.

    Returns:
        dict mapping token to integer id.  ``"<UNK>"`` is 0, ``"<PAD>"`` is 1
        and the kept words follow in descending frequency order.

    Side effects:
        Adds the counts to the module-level ``word_count`` dict, so counts
        accumulate when the function is called more than once.
    """
    if max_word is None:
        max_word = MAX_WORD

    for line in text:
        for token in tokenizer(line):
            # The first occurrence counts as 1 (it used to be stored as 0).
            word_count[token] = word_count.get(token, 0) + 1

    print("build vocabulary")
    vocab = {"<UNK>": 0, "<PAD>": 1}
    # sorted() is stable, so equally frequent words keep first-seen order.
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    kept = 0
    for word, _ in ranked:
        if kept >= max_word:
            break
        if word not in vocab:
            vocab[word] = len(vocab)
        kept += 1
    return vocab


vocab = data_process(X_train)
# print(vocab)


class GRU(nn.Module):
    """Recurrent sentiment classifier: embedding -> RNN encoder -> linear -> softmax.

    Despite the class name, the default encoder is an ``nn.LSTM``, exactly as
    in the original code.  Pass ``rnn_type="gru"`` to use an ``nn.GRU``.

    Raises:
        ValueError: if ``rnn_type`` is neither ``"lstm"`` nor ``"gru"``.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers, rnn_type="lstm"):
        super(GRU, self).__init__()
        rnn_classes = {"lstm": nn.LSTM, "gru": nn.GRU}
        try:
            rnn_cls = rnn_classes[rnn_type.lower()]
        except KeyError:
            raise ValueError(
                "rnn_type must be 'lstm' or 'gru', got %r" % (rnn_type,)) from None
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer
        self.encoder = rnn_cls(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        """Return class probabilities (batch, 2) for token ids (batch, seq_len)."""
        # The encoder is not batch_first, so move seq_len to the first axis.
        embeddings = self.embedding(inputs.permute(1, 0))
        # outputs: (seq_len, batch, num_hiddens) - top-layer state per step.
        outputs, _ = self.encoder(embeddings)
        # Use the top-layer state at the last time step as the encoding.
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # probabilities [a, b]
        return outs


def text_transform(sentence_list, vocab):
    """Convert sentences to a LongTensor of token ids of width ``MAX_LEN``.

    Unknown tokens map to ``vocab['<UNK>']``; rows are truncated to
    ``MAX_LEN`` tokens and right-padded with ``vocab['<PAD>']``.
    """
    unk_id = vocab['<UNK>']
    pad_id = vocab['<PAD>']
    sentence_index_list = []
    for sentence in sentence_list:
        sentence_idx = [vocab.get(token, unk_id) for token in tokenizer(sentence)][:MAX_LEN]
        sentence_idx.extend([pad_id] * (MAX_LEN - len(sentence_idx)))
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)


# Model training
def train(model, train_data, vocab, epoch=10):
    """Train *model* and save its parameters to 'LSTM_IMDB_parameter.pkl'.

    Prints the mean batch loss and mean batch accuracy after every epoch.
    """
    print('train model')
    model = model.to(device)
    # Loss function and optimizer.
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    # Use a throwaway loop variable so the `epoch` argument is not shadowed.
    for _ in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # running loss for this epoch
        avg_acc = 0  # running accuracy for this epoch
        for text, label in tqdm(train_data):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            # NLLLoss expects log-probabilities; clamp first so a probability
            # that underflowed to 0 cannot yield log(0) = -inf.
            loss = criterion(pred.clamp(min=1e-12).log(), train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # End of epoch: average over the number of batches.
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)
        print("avg_loss:", avg_loss, " train_avg_acc:,", avg_acc)
    # Save the parameters once training has finished.
    torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')


class MyDataset(Dataset):
    """Dataset over a text file whose lines are "<label> <text>"."""

    def __init__(self, text_path):
        # The context manager closes the file even if reading fails.
        with open(text_path, 'r', encoding='utf-8') as file:
            self.text_with_tag = file.readlines()  # raw "<label> <text>" lines

    def __getitem__(self, index):
        """Return ``(text, label)`` for the sample at *index*."""
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose its last character.
        line = self.text_with_tag[index].rstrip('\n')
        # Split at the first space: labels with several digits stay intact.
        label, _, text = line.partition(' ')
        return text, int(label)

    def __len__(self):
        """Return the number of lines read from the file."""
        return len(self.text_with_tag)


# Model evaluation
def tst(model, test_data, vocab):
    """Evaluate *model* and return the mean of the per-batch accuracies."""
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for text, label in tqdm(test_data):
            test_x = text_transform(text, vocab).to(device)
            test_y = label.to(device)
            pred = model(test_x)
            avg_acc += accuracy(pred, test_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc


# Prediction accuracy
def accuracy(y_pred, y_true):
    """Return the fraction of rows whose arg-max class equals the label."""
    label_pred = y_pred.max(dim=1)[1]  # index of the highest score per row
    # Count exact matches.  Subtracting |pred - true| from the batch size is
    # only correct when the labels are 0/1.
    correct = torch.sum(label_pred == y_true)
    return correct.detach().cpu().numpy() / len(y_pred)


def main():
    """Build the vocabulary, train the model, reload it and print test accuracy."""
    vocab = data_process(X_train)
    np.save('vocab.npy', vocab)  # store the vocabulary on disk
    # allow_pickle is required because the array wraps a dict; only load
    # vocab.npy files produced by this script.
    vocab = np.load('vocab.npy', allow_pickle=True).item()

    # Dataset instances over the files written above.
    # train_data = X_train
    # test_data = X_test
    train_data = MyDataset(text_path="./train.txt")
    test_data = MyDataset(text_path="./test.txt")

    # Data loaders.
    # train_data = GetLoader(X_train, y_train)
    # test_data = GetLoader(X_test, y_test)
    # train_data = gdata.ArrayDataset(X_train, y_train)
    train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Model.
    model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)
    train(model=model, train_data=train_loader, vocab=vocab, epoch=30)

    # Reload the saved parameters onto the module-level device.
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl', map_location=device))

    # Test result.
    acc = tst(model=model, test_data=test_loader, vocab=vocab)
    print(acc)


if __name__ == '__main__':
    main()

# Summary
以上是生活随笔為你收集整理的电影评论情感分析-IMDB数据集 | python 利用朴素贝叶斯、神经网络模型的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 运维工程师 常见的 trouble sh
- 下一篇: python离线安装第三方库whl_详细