Implementing BERT Text Classification with the huggingface/transformers PyTorch Framework
- Background
- Project structure
- Installing dependencies
- Data and the pre-trained model
  - Data
  - Pre-trained model
- Code
Background
After implementing a BERT text-classification model with bert_keras, the author found that training could not take advantage of the GPU, and therefore reimplemented the model on the huggingface/transformers framework. Because the expected model input format was unclear and the API had no Chinese documentation, the reimplementation still took considerable effort, so ready-to-use code is shared here. The huggingface/transformers source code lives at https://github.com/huggingface/transformers.
Project structure
The project is laid out as follows:
- bert-pretrain: used to train your own pre-trained model. You can also use a checkpoint already trained by Google as the pre-trained model.
- bert_nlpc/preTrain_model: holds your pre-trained model, vocab.txt, and bert_config.json.
- bert_nlpc/data: holds your training and validation data.
- bert_nlpc/bert-base-transformers: holds the BERT model code.
Installing dependencies
The main wrinkle is Windows, where PyTorch cannot be installed with a plain pip install; you have to point pip at the official wheel index. The commands are:
# Windows:
pip install torch===1.6.0 torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
# Linux:
pip install torch torchvision
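After installing, it is worth confirming that PyTorch can actually see your GPU, since GPU acceleration was the motivation here. A quick check:

import torch
print(torch.__version__)         # e.g. 1.6.0
print(torch.cuda.is_available()) # should print True on a machine with a usable CUDA GPU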
Data and the pre-trained model
Data
The data is stored in a CSV file (once you understand the data-loading code you can switch to another storage format). The layout is as follows: the first column is the label and the second column is the text.
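A minimal sketch of such a file (the labels and texts below are made up; labels should be integers from 0 to n_class-1, and note that the loading code below opens the file with encoding='gbk'):

0,这家餐厅的菜很好吃
1,物流太慢了,体验不好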
Pre-trained model
The pre-training here was not done with the transformers framework, but PyTorch loads model weights saved in .bin format, so the ckpt-format checkpoint must first be converted to .bin. The commands are as follows:
cd bert_nlpc/preTrain_model # change into the directory containing bert_model.ckpt
transformers-cli convert --model_type bert \
--tf_checkpoint bert_model.ckpt \
--config bert_config.json \
--pytorch_dump_output pytorch_model.bin
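To sanity-check the converted weights, you can try loading them the same way the training script below does; a minimal sketch, run from inside bert_nlpc/preTrain_model:

from transformers import BertConfig, BertForSequenceClassification
config = BertConfig.from_json_file('bert_config.json')
model = BertForSequenceClassification.from_pretrained('pytorch_model.bin', config=config)
print(model.num_parameters()) # if this prints a parameter count, the conversion worked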
Code
import random
import numpy as np
import torch
import csv
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification, AdamW, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
class BertClassifier:
def __init__(self,train,validation,vocab_path,config_path,pretrain_Model_path,saveModel_path,learning_rate,n_class,epochs,batch_size,val_batch_size,max_len,gpu=True):
        self.n_class = n_class # number of classes
        self.max_len = max_len # maximum sentence length
        self.lr = learning_rate # learning rate
        self.tokenizer = BertTokenizer.from_pretrained(vocab_path) # load the tokenizer
        self.train = self.load_data(train) # load the training set
        self.validation = self.load_data(validation) # load the validation set
        self.epochs = epochs
        self.batch_size = batch_size # batch size for training
        self.val_batch_size = val_batch_size
        self.saveModel_path = saveModel_path # where to save the trained model
        self.gpu = gpu # whether to use the GPU
        config = BertConfig.from_json_file(config_path) # load the BERT config
        config.num_labels = n_class # set the number of output classes
        self.model = BertForSequenceClassification.from_pretrained(pretrain_Model_path,config=config) # load the BERT classification model
if self.gpu:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
self.device = torch.device('cuda')
else:
            self.device = torch.device('cpu')
    def encode_fn(self,text_list):
        # encode text_list into the input tensors the BERT model expects
        # text_list: e.g. ['我爱你','猫不是狗']
        encodings = self.tokenizer(
            text_list,
            padding = True,
            truncation = True,
            max_length = self.max_len,
            return_tensors='pt' # return PyTorch tensors
        )
        input_ids = encodings['input_ids']
        token_type_ids = encodings['token_type_ids']
        attention_mask = encodings['attention_mask']
return input_ids,token_type_ids,attention_mask
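    # for example (hypothetical call): encode_fn(['我爱你','猫不是狗']) returns
    # three tensors of shape (2, seq_len): the token ids, the segment ids, and
    # the attention mask that marks real tokens versus padding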
    def load_data(self,path):
        # only handles csv files
        text_list = []
        labels = []
        with open(path, encoding='gbk') as f:
            for line in csv.reader(f):
                label = int(line[0]) # adjust this index if the label sits in a different column
                text = line[1]
                text_list.append(text)
                labels.append(label)
input_ids,token_type_ids,attention_mask = self.encode_fn(text_list)
labels = torch.tensor(labels)
data = TensorDataset(input_ids,token_type_ids,attention_mask,labels)
return data
    def load_data_predict(self,path):
        # load a file for prediction with the BERT classifier
        text_list = []
        labels = []
        with open(path, encoding='gbk') as f:
            for line in csv.reader(f):
                text = line[1]
                text_list.append(text)
                label = int(line[0])
                labels.append(label)
        return text_list,labels
def flat_accuracy(self, preds, labels):
"""A function for calculating accuracy scores"""
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return accuracy_score(labels_flat, pred_flat)
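    # worked example (made-up numbers): preds=[[2.1,-1.0],[0.3,0.8]] has
    # row-wise argmax [0,1]; against labels=[1,1] that is an accuracy of 0.5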
    def train_model(self):
        # train the model
        if self.gpu:
            self.model.cuda()
        optimizer = AdamW(self.model.parameters(), lr=self.lr)
        trainData = DataLoader(self.train, batch_size = self.batch_size, shuffle = True) # batch the training data
        valData = DataLoader(self.validation, batch_size = self.val_batch_size, shuffle = False)
total_steps = len(trainData) * self.epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
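        # with num_warmup_steps=0 there is no warmup phase: the learning rate
        # simply decays linearly from self.lr down to 0 over total_steps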
for epoch in range(self.epochs):
self.model.train()
total_loss, total_val_loss = 0, 0
total_eval_accuracy = 0
print('epoch:' , epoch , ', step_number:' , len(trainData))
            # training loop
for step,batch in enumerate(trainData):
self.model.zero_grad()
loss, logits = self.model(input_ids = batch[0].to(self.device),
token_type_ids=batch[1].to(self.device),
attention_mask=batch[2].to(self.device),
labels=batch[3].to(self.device)
                                         ) # returns the loss and the per-class logits; softmax over the logits gives the class probabilities
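                # note: this tuple unpacking assumes transformers < 4.0; newer
                # versions return a ModelOutput by default, so either pass
                # return_dict=False to the call or read outputs.loss / outputs.logits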
total_loss += loss.item()
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
optimizer.step()
scheduler.step()
                if step % 10 == 0 and step > 0: # every 10 steps, report accuracy on the current training batch
                    # the logits for this batch are already computed, so no
                    # eval-mode forward pass is needed (the model stays in train mode)
                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].cpu().numpy()
avg_val_accuracy = self.flat_accuracy(logits, label_ids)
print('step:' , step)
print(f'Accuracy: {avg_val_accuracy:.4f}')
print('\n')
            # at the end of each epoch, evaluate the model on the validation set
            self.model.eval()
            print('validating ...')
for i, batch in enumerate(valData):
with torch.no_grad():
loss, logits = self.model(input_ids=batch[0].to(self.device),
token_type_ids=batch[1].to(self.device),
attention_mask=batch[2].to(self.device),
labels=batch[3].to(self.device)
)
total_val_loss += loss.item()
logits = logits.detach().cpu().numpy()
                label_ids = batch[3].cpu().numpy()
total_eval_accuracy += self.flat_accuracy(logits, label_ids)
avg_train_loss = total_loss / len(trainData)
avg_val_loss = total_val_loss / len(valData)
avg_val_accuracy = total_eval_accuracy / len(valData)
print(f'Train loss : {avg_train_loss}')
print(f'Validation loss: {avg_val_loss}')
print(f'Accuracy: {avg_val_accuracy:.4f}')
print('\n')
self.save_model(self.saveModel_path + '-' + str(epoch))
def save_model(self , path):
        # save the tokenizer and the classification model
self.model.save_pretrained(path)
self.tokenizer.save_pretrained(path)
def load_model(self,path):
        # load the tokenizer and the classification model
tokenizer = AutoTokenizer.from_pretrained(path)
model = BertForSequenceClassification.from_pretrained(path)
return tokenizer,model
def eval_model(self,Tokenizer, model,text_list,y_true):
        # report the model's precision, recall, and F1 score via classification_report
preds = self.predict_batch(Tokenizer, model, text_list)
print(classification_report(y_true,preds))
def predict_batch(self, Tokenizer, model, text_list):
        encodings = Tokenizer(
            text_list,
            padding = True,
            truncation = True,
            max_length = self.max_len,
            return_tensors='pt' # return PyTorch tensors
        )
        input_ids = encodings['input_ids']
        token_type_ids = encodings['token_type_ids']
        attention_mask = encodings['attention_mask']
pred_data = TensorDataset(input_ids,token_type_ids,attention_mask)
pred_dataloader = DataLoader(pred_data, batch_size=self.batch_size, shuffle=False)
model = model.to(self.device)
model.eval()
preds = []
for i, batch in enumerate(pred_dataloader):
with torch.no_grad():
outputs = model(input_ids=batch[0].to(self.device),
token_type_ids=batch[1].to(self.device),
attention_mask=batch[2].to(self.device)
)
logits = outputs[0]
logits = logits.detach().cpu().numpy()
preds += list(np.argmax(logits, axis=1))
return preds
if __name__ == '__main__':
epoch = 3
    # the pre-trained model lives in ../../preTrain_model/gongdan_step5000_ml128/
    # the trained classifier and tokenizer are saved under ../trained_model/bert_model/gongdan_step5000_ml128/
model_file = 'preTrain_model'
trained_model_file = 'bert_model'
model_name = 'gongdan_step5000_ml128'
    bert_model = BertClassifier(
train = '../../data/train.csv',
validation = '../../data/validation.csv',
vocab_path = '../../'+ model_file +'/'+ model_name +'/vocab.txt',
config_path = '../../' + model_file + '/'+ model_name +'/bert_config.json',
        pretrain_Model_path = '../../'+ model_file +'/'+ model_name +'/pytorch_model.bin', # the file produced by the transformers-cli conversion above
saveModel_path = '../trained_model/'+ trained_model_file+'/'+model_name,
learning_rate = 2e-5,
n_class = 8,
epochs = epoch,
batch_size = 4,
val_batch_size = 4,
        max_len = 100,
gpu = True
)
bert_model.train_model()
Tokenizer,model = bert_model.load_model(bert_model.saveModel_path + '-'+str(epoch-1))
text_list,y_true = bert_model.load_data_predict('../../data/validation.csv')
bert_model.eval_model(Tokenizer, model,text_list,y_true)
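Once training has produced a saved model, you can also classify a handful of ad-hoc sentences without going through a CSV file, by reusing the methods above (a minimal sketch; the example texts are made up):

Tokenizer, model = bert_model.load_model(bert_model.saveModel_path + '-' + str(epoch - 1))
preds = bert_model.predict_batch(Tokenizer, model, ['文本一', '文本二'])
print(preds) # a list of predicted label indices, e.g. [3, 0]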