Contents
- 1. Data Processing
- 2. Downloading the Pretrained Model
- 3. Loading the Data
- 4. Defining the Model
- 5. Training
- 6. Submitting Test Results
Competition: https://www.kaggle.com/c/ds100fa19
Related posts:
[Kaggle] Spam/Ham Email Classification (spaCy)
[Kaggle] Spam/Ham Email Classification (RNN/GRU/LSTM)
This post takes a pretrained model from huggingface and finetunes it on the spam email dataset, then submits the test predictions to Kaggle.
The code here draws on《自然語言處理動手學(xué)Bert文本分類》(a hands-on Bert text-classification tutorial).
1. Data Processing
```python
from datetime import timedelta

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")
test_csv = pd.read_csv("test.csv")
train = train.fillna(" ")
test_csv = test_csv.fillna(" ")

# Concatenate subject and body into a single text field
train['all'] = train['subject'] + ' ' + train['email']

from sklearn.model_selection import StratifiedShuffleSplit

splt = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
for train_idx, valid_idx in splt.split(train, train['spam']):
    train_part = train.loc[train_idx]
    valid_part = train.loc[valid_idx]

y_train = train_part['spam']
y_valid = valid_part['spam']
X_train = train_part['all']
X_valid = valid_part['all']

X_test = test_csv['subject'] + ' ' + test_csv['email']
y_test = [0] * len(X_test)  # placeholder labels; the real test labels are unknown
y_test = torch.LongTensor(y_test)
```
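As a quick sanity check (my addition, not part of the original code), the stratified split should leave the spam ratio nearly identical in the training and validation parts:

```python
# Stratification should preserve the class ratio in both splits.
print("train spam ratio: {:.4f}".format(y_train.mean()))
print("valid spam ratio: {:.4f}".format(y_valid.mean()))
print("sizes:", len(X_train), len(X_valid), len(X_test))
```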
2. Downloading the Pretrained Model
The pretrained model files are available on huggingface.
If the download is slow, I have also uploaded the files to CSDN, where they can be fetched for free.
Put all of the model files into a single folder, e.g. ./bert_hugginggace/
Install the required package in advance:
```
pip install transformers
```
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("./bert_hugginggace")
pretrain_model = AutoModelForSequenceClassification.from_pretrained("./bert_hugginggace")
```
Some parameters used below:
```python
PAD, CLS = '[PAD]', '[CLS]'
max_seq_len = 128
bert_hidden = 768       # hidden size of bert-base (not used directly below)
num_classes = 2
learning_rate = 1e-5
decay = 0.01            # weight decay for AdamW
num_epochs = 5
early_stop_time = 2000  # stop if val loss has not improved for this many batches
batch_size = 32
save_path = "./best_model.ckpt"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```
3. Loading the Data
- The data has to be encoded into the format BERT expects: token_ids plus an attention_mask.
```python
def load_dataset(texts, labels):
    contents = []
    for t, label in zip(texts, labels):
        token = tokenizer.tokenize(t)
        token = [CLS] + token            # prepend the [CLS] token
        seq_len = len(token)
        mask = []
        token_ids = tokenizer.convert_tokens_to_ids(token)
        if len(token) < max_seq_len:
            # pad ids with 0 ([PAD]) and mark padding positions with 0 in the mask
            mask = [1] * len(token) + [0] * (max_seq_len - len(token))
            token_ids = token_ids + [0] * (max_seq_len - len(token))
        else:
            # truncate sequences that are too long
            mask = [1] * max_seq_len
            token_ids = token_ids[:max_seq_len]
            seq_len = max_seq_len
        y = [0] * num_classes
        y[label] = 1                     # one-hot encode the label
        contents.append((token_ids, y, seq_len, mask))
    return contents
```
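For reference, recent versions of transformers can produce the same fields in one call; the sketch below is my addition, with one minor difference from the hand-rolled version above: `tokenizer(...)` also appends a [SEP] token after the text, not just a leading [CLS].

```python
# Equivalent encoding via the tokenizer itself: padding, truncation and the
# attention mask are handled in one call (ids are padded with BERT's [PAD] id 0).
enc = tokenizer(
    "some email text",
    padding="max_length",
    truncation=True,
    max_length=max_seq_len,
)
print(enc["input_ids"][:10], enc["attention_mask"][:10])
```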
- Write a dataset iterator: during training it yields batch_size samples at a time to update the weights.
```python
class datasetIter():
    def __init__(self, datasets, batch_size, device):
        self.datasets = datasets
        self.idx = 0
        self.device = device
        self.batch_size = batch_size
        self.batches = len(datasets) // batch_size
        self.residues = False            # is there a final, smaller batch?
        if len(datasets) % batch_size != 0:
            self.residues = True

    def __next__(self):
        if self.residues and self.idx == self.batches:
            # the last, partial batch
            batch_data = self.datasets[self.idx * self.batch_size: len(self.datasets)]
            self.idx += 1
            return self._to_tensor(batch_data)
        elif self.idx >= self.batches:   # >= (not >) so no empty batch is emitted
            self.idx = 0
            raise StopIteration
        else:
            batch_data = self.datasets[self.idx * self.batch_size: (self.idx + 1) * self.batch_size]
            self.idx += 1
            return self._to_tensor(batch_data)

    def _to_tensor(self, datasets):
        x = torch.LongTensor([item[0] for item in datasets]).to(self.device)
        y = torch.FloatTensor([item[1] for item in datasets]).to(self.device)
        seq_len = torch.LongTensor([item[2] for item in datasets]).to(self.device)
        mask = torch.LongTensor([item[3] for item in datasets]).to(self.device)
        return (x, seq_len, mask), y

    def __iter__(self):
        return self

    def __len__(self):
        if self.residues:
            return self.batches + 1
        else:
            return self.batches


def build_iter(datasets, batch_size, device):
    return datasetIter(datasets, batch_size, device)
```
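Before training, a small smoke test (my addition, not in the original post) confirms the iterator emits tensors with the expected shapes:

```python
# Build an iterator over the first 64 training samples and inspect one batch.
probe_data = load_dataset(X_train.iloc[:64], y_train.iloc[:64])
probe_iter = build_iter(probe_data, batch_size, device)
(x, seq_len, mask), y = next(iter(probe_iter))
print(x.shape, mask.shape, y.shape)  # expected: [32, 128], [32, 128], [32, 2]
```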
4. Defining the Model
```python
class myModel(nn.Module):
    def __init__(self):
        super(myModel, self).__init__()
        self.pretrain_model = pretrain_model
        # finetune: keep every BERT parameter trainable
        for param in self.pretrain_model.parameters():
            param.requires_grad = True

    def forward(self, x):
        context = x[0]   # token ids
        mask = x[2]      # attention mask
        out = self.pretrain_model(context, attention_mask=mask)
        out = torch.sigmoid(out.logits)   # per-class probabilities
        return out
```
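A quick untrained forward pass (a sketch of mine, not from the original post) shows what the model returns: a [batch_size, num_classes] tensor of probabilities, which is what F.binary_cross_entropy later compares against the one-hot labels:

```python
# Run one forward pass on a small validation batch to confirm the output shape.
probe_model = myModel().to(device)
probe = build_iter(load_dataset(X_valid.iloc[:batch_size], y_valid.iloc[:batch_size]),
                   batch_size, device)
X_b, y_b = next(iter(probe))
with torch.no_grad():
    out = probe_model(X_b)
print(out.shape)  # expected: torch.Size([32, 2])
```

Design note: sigmoid plus binary cross-entropy over one-hot labels is a slightly unusual choice; softmax plus `F.cross_entropy` on integer labels is the more common setup for two mutually exclusive classes, but both work on this task.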
5. Training
```python
import time

import torch.nn.functional as F
from sklearn import metrics
from transformers.optimization import AdamW


def get_time_dif(starttime):
    endtime = time.time()
    return timedelta(seconds=int(round(endtime - starttime)))
```
```python
def train(model, train_iter, dev_iter, test_iter):
    starttime = time.time()
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=decay)
    total_batch = 0
    dev_best_loss = float("inf")
    last_improve = 0          # last batch at which the val loss improved
    no_improve_flag = False
    model.train()
    for epoch in range(num_epochs):
        print("Epoch {}/{}".format(epoch + 1, num_epochs))
        for i, (X, y) in enumerate(train_iter):
            outputs = model(X)
            model.zero_grad()
            loss = F.binary_cross_entropy(outputs, y)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # evaluate every 100 batches and checkpoint on improvement
                truelabels = torch.max(y.data, 1)[1].cpu()
                pred = torch.max(outputs, 1)[1].cpu()
                train_acc = metrics.accuracy_score(truelabels, pred)
                dev_acc, dev_loss = evaluate(model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ' '
                time_dif = get_time_dif(starttime)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > early_stop_time:
                print("no improve after {} times, stop!".format(early_stop_time))
                no_improve_flag = True
                break
        if no_improve_flag:
            break
    test(model, test_iter)
```
```python
def evaluate(model, dev_iter):
    model.eval()
    loss_total = 0
    pred_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for X, y in dev_iter:
            outputs = model(X)
            loss = F.binary_cross_entropy(outputs, y)
            loss_total += loss
            truelabels = torch.max(y.data, 1)[1].cpu()
            pred = torch.max(outputs, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, truelabels)
            pred_all = np.append(pred_all, pred)
    acc = metrics.accuracy_score(labels_all, pred_all)
    return acc, loss_total / len(dev_iter)   # accuracy and mean batch loss
```
```python
def test(model, test_iter):
    model.load_state_dict(torch.load(save_path))   # restore the best checkpoint
    model.eval()
    pred_all = np.array([], dtype=int)
    with torch.no_grad():
        for X, y in test_iter:
            outputs = model(X)
            pred = torch.max(outputs, 1)[1].cpu().numpy()
            pred_all = np.append(pred_all, pred)
    ids = test_csv['id']
    output = pd.DataFrame({'id': ids, 'Class': pred_all})
    output.to_csv("submission_bert.csv", index=False)
```
```python
np.random.seed(520)
torch.manual_seed(520)
torch.cuda.manual_seed_all(520)
torch.backends.cudnn.deterministic = True

train_data = load_dataset(X_train, y_train)
valid_data = load_dataset(X_valid, y_valid)
test_data = load_dataset(X_test, y_test)

train_iter = build_iter(train_data, batch_size, device)
valid_iter = build_iter(valid_data, batch_size, device)
test_iter = build_iter(test_data, batch_size, device)

model = myModel().to(device)
train(model, train_iter, valid_iter, test_iter)
```
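Once training has finished, individual emails can be scored with the saved weights; the helper and the sample text below are my own sketch, not from the original post:

```python
def predict_one(model, subject, email):
    """Return 1 if the message is classified as spam, 0 otherwise."""
    data = load_dataset([subject + ' ' + email], [0])  # dummy label, ignored at inference
    X, _ = next(iter(build_iter(data, 1, device)))
    model.eval()
    with torch.no_grad():
        out = model(X)
    return torch.max(out, 1)[1].item()

# Hypothetical usage:
# print(predict_one(model, "WINNER!!", "Claim your free prize now"))
```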
6. Submitting Test Results
Private Score: 0.98714
Public Score: 0.99000
Without much hyperparameter tuning, the accuracy comes out close to 99%, which is a very solid result!
Comments and corrections are welcome. Thanks!