21. Text Sentiment Classification (Part 2)
1.1 Overview of Text Vectorization
- Before building a deep learning model, the text must be converted into a vector representation (Word Embedding). This happens in two steps: first convert the text into numbers (text serialization), then convert those numbers into vectors.
- A practical approach is to store each word and its corresponding number in a dictionary, and to represent each sentence as a list of numbers.
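As a minimal sketch of this two-step idea (the toy vocabulary and sentence below are made-up examples, not part of the article's code):

import torch
import torch.nn as nn

# Step 1: text -> numbers via a word-to-index dictionary (0/1 reserved for PAD/UNK).
word2idx = {"PAD": 0, "UNK": 1, "this": 2, "movie": 3, "is": 4, "great": 5}
sentence = "this movie is great".split()
indices = [word2idx.get(w, word2idx["UNK"]) for w in sentence]  # [2, 3, 4, 5]

# Step 2: numbers -> vectors via an embedding lookup table.
embedding = nn.Embedding(num_embeddings=len(word2idx), embedding_dim=4)
vectors = embedding(torch.LongTensor(indices))  # shape: (4, 4) = (seq_len, embedding_dim)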
1.2 The Text Serialization Process
Before implementing text serialization, consider the following points (a minimal sketch of a serializer addressing them follows this list):

- How to map words to numbers using a dictionary.
- Different words occur with very different frequencies; should high-frequency or low-frequency words be filtered out?
- Once the dictionary is built, how to convert a sentence into a number sequence, and how to convert a number sequence back into a sentence.
- Sentences have different lengths; how should the sentences in each batch be made the same length?
- How to handle new words that do not appear in the dictionary (represent them with a special token).
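The `ws` object used in the code below is such a serializer, built by save_ws.py and pickled to ws.pkl. Since save_ws.py is not reproduced in this article, the following is only a hypothetical sketch of the interface the later code relies on (`ws.PAD`, `len(ws)`, and `ws.transform(sentence, max_len)`); the actual implementation may differ:

class WordSequence:
    """Hypothetical sketch of the serializer saved to ws.pkl."""
    PAD_TAG, UNK_TAG = "PAD", "UNK"
    PAD, UNK = 0, 1

    def __init__(self):
        self.dict = {self.PAD_TAG: self.PAD, self.UNK_TAG: self.UNK}

    def fit(self, sentences, min_count=5):
        # Count word frequencies and drop rare words before assigning indices.
        count = {}
        for sentence in sentences:
            for word in sentence:
                count[word] = count.get(word, 0) + 1
        for word, freq in count.items():
            if freq >= min_count:
                self.dict[word] = len(self.dict)

    def transform(self, sentence, max_len=None):
        # Convert a tokenized sentence to indices, truncating or padding to max_len.
        if max_len is not None:
            sentence = sentence[:max_len]
            sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
        return [self.dict.get(word, self.UNK) for word in sentence]

    def __len__(self):
        return len(self.dict)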
2. 文本情感分類的模型構建
2.1 文本情感分類具體流程
- Data preprocessing: read the text data and load it in batches, using DataLoader to handle the reading; see data_prepare.py for the implementation.
- Text serialization: convert the text data into the index sequences used for the vector representation (Word Embedding); see save_ws.py for the implementation.
- Model construction and evaluation: implemented in model.py.
2.2 Code
# data_prepare.py
import os
import pickle
import re

import torch
from torch.utils.data import DataLoader, Dataset

data_base_path = r"data\aclImdb"
# Word-to-index serializer built beforehand by save_ws.py.
ws = pickle.load(open("ws.pkl", "rb"))
max_len = 20


def tokenize(text):
    """Strip HTML tags and punctuation, then split on whitespace."""
    filters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';',
               '<', '=', '>', '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~',
               '\t', '\n', '\x97', '\x96', '”', '“']
    text = re.sub("<.*?>", " ", text, flags=re.S)            # remove HTML tags such as <br>
    text = re.sub("|".join(filters), " ", text, flags=re.S)  # replace punctuation with spaces
    return [i.strip() for i in text.split()]


class ImdbDataset(Dataset):
    def __init__(self, mode):
        super(ImdbDataset, self).__init__()
        if mode == "train":
            text_path = [os.path.join(data_base_path, i) for i in ["train/neg", "train/pos"]]
        else:
            text_path = [os.path.join(data_base_path, i) for i in ["test/neg", "test/pos"]]
        # Collect the full path of every review file under the chosen directories.
        self.total_file_path_list = []
        for i in text_path:
            self.total_file_path_list.extend([os.path.join(i, j) for j in os.listdir(i)])

    def __getitem__(self, idx):
        cur_path = self.total_file_path_list[idx]
        cur_filename = os.path.basename(cur_path)
        # IMDB filenames look like "457_9.txt"; the number after "_" is the 1-10 rating,
        # shifted to 0-9 so it can be used directly as a class index.
        label = int(cur_filename.split("_")[-1].split(".")[0]) - 1
        text = tokenize(open(cur_path, encoding="utf-8").read().strip())
        return label, text

    def __len__(self):
        return len(self.total_file_path_list)


def collate_fn(batch):
    """Serialize each tokenized review to a fixed-length index sequence and batch the results."""
    label, content = list(zip(*batch))
    content = [ws.transform(i, max_len=max_len) for i in content]
    content = torch.LongTensor(content)
    label = torch.LongTensor(label)
    return label, content


def get_dataloader(train_data=True):
    mode = "train" if train_data else "test"
    imdb_dataset = ImdbDataset(mode)
    dataloader = DataLoader(dataset=imdb_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn)
    return dataloader


if __name__ == '__main__':
    text = "I cannot stay indifferent<br></br> to Lars| van Trier's films. "
    s = tokenize(text)
    dataset = ImdbDataset(mode="train")
    dataloader = get_dataloader()
    for idx, (label, text) in enumerate(dataloader):
        print("idx:", idx)
        print("label:", label)
        print("text:", text)
        break
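With the settings above (batch_size=10, max_len=20), running this file directly prints one batch: label is a LongTensor of shape (10,) holding the 0-9 ratings, and text is a LongTensor of shape (10, 20), i.e. (batch_size, max_len).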
# model.py
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

from unit21.data_prepare import get_dataloader, max_len

ws = pickle.load(open("ws.pkl", "rb"))


class IMDBModel(nn.Module):
    def __init__(self, max_len):
        super(IMDBModel, self).__init__()
        # One 300-dim vector per vocabulary word; padding_idx keeps the PAD embedding at zero.
        self.embedding = nn.Embedding(len(ws), 300, padding_idx=ws.PAD)
        # Flattened sequence (max_len * 300) -> 10 classes (ratings 0-9).
        self.fc = nn.Linear(max_len * 300, 10)

    def forward(self, x):
        embed = self.embedding(x)          # (batch_size, max_len, 300)
        embed = embed.view(x.size(0), -1)  # (batch_size, max_len * 300)
        out = self.fc(embed)               # (batch_size, 10)
        return F.log_softmax(out, dim=-1)


model = IMDBModel(max_len)
optimizer = Adam(model.parameters(), 0.001)


def train(epoch):
    train_dataloader = get_dataloader()
    for idx, (target, input) in enumerate(train_dataloader):
        optimizer.zero_grad()
        output = model(input)
        loss = F.nll_loss(output, target)  # log_softmax + nll_loss = cross-entropy
        loss.backward()
        optimizer.step()
        if idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, idx * len(input), len(train_dataloader.dataset),
                100. * idx / len(train_dataloader), loss.item()))
            torch.save(model.state_dict(), "imdb_net.pkl")


def test():
    test_loss = 0
    correct = 0
    model.load_state_dict(torch.load("imdb_net.pkl"))
    model.eval()
    test_dataloader = get_dataloader(train_data=False)  # evaluate on the test split
    with torch.no_grad():
        for target, input in test_dataloader:
            output = model(input)
            test_loss += F.nll_loss(output, target, reduction="sum").item()
            pred = torch.max(output, dim=-1, keepdim=False)[-1]
            correct += pred.eq(target.data).sum().item()
    test_loss = test_loss / len(test_dataloader.dataset)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_dataloader.dataset),
        100. * correct / len(test_dataloader.dataset)))


if __name__ == '__main__':
    epoch = 1
    train(epoch)
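Note that the __main__ block above only runs train(); call test() afterwards to load imdb_net.pkl and report accuracy on the test split. As a quick sanity check of the tensor shapes flowing through the model, here is a hypothetical standalone snippet using the same batch_size=10 and max_len=20 as above (the vocabulary size 5000 is a made-up stand-in for len(ws)):

import torch

batch_size, max_len, embedding_dim, vocab_size = 10, 20, 300, 5000
x = torch.randint(0, vocab_size, (batch_size, max_len))  # fake batch of word-index sequences
embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
fc = torch.nn.Linear(max_len * embedding_dim, 10)

embed = embedding(x)              # (10, 20, 300)
flat = embed.view(x.size(0), -1)  # (10, 6000)
out = fc(flat)                    # (10, 10); log_softmax over dim=-1 then gives per-class log-probabilities
print(embed.shape, flat.shape, out.shape)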