Implementing BERT Text Classification with the huggingface/transformers PyTorch Framework

  • Background
  • Project structure
  • Installing dependencies
  • Data and pretrained model
  • Data
  • Pretrained model
  • Code


Background

After implementing a BERT text classification model with bert_keras, I found that training could not take advantage of the GPU, so I decided to reimplement the model with the huggingface/transformers framework. Because I was unfamiliar with its model input format and the API has no Chinese documentation, the implementation still took quite a bit of effort, so I am sharing ready-to-use code here. The huggingface/transformers source code is at https://github.com/huggingface/transformers.

Project structure

The project structure is as follows; a rough sketch of the directory layout follows the list.


  • bert-pretrain:
    Used to train your own pretrained model. You can also use a model already pretrained by Google as the pretrained model.
  • bert_nlpc/preTrain_model:
    Holds your pretrained model, vocab.txt, and bert_config.json.
  • bert_nlpc/data:
    Holds your training and validation data.
  • bert_nlpc/bert-base-transformers:
    Holds the BERT model code.
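
A rough sketch of the layout (the exact nesting is an assumption, inferred from the list above and from the relative paths used in the code further down):

bert-pretrain/
bert_nlpc/
├── preTrain_model/             # pretrained weights, vocab.txt, bert_config.json
├── data/                       # train.csv, validation.csv
└── bert-base-transformers/     # the BERT classification code (the script in the Code section)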

Installing dependencies

The main point is that on Windows, PyTorch cannot be installed with a plain pip install; use the following commands:

#windows:
pip install torch===1.6.0 torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
#linux:
pip install torch torchvision
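
The script in the Code section also imports transformers and scikit-learn, so install those as well:

pip install transformers scikit-learn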

Data and pretrained model

Data

The data is stored in a CSV file (once you understand the data-loading code you can switch to a different storage format). The format is shown below: the first column is the label and the second column is the text.

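For illustration, a hypothetical two-row sample in this format (the loading code below expects a gbk-encoded CSV with no header row; the example sentences are borrowed from the comments in the code, and the labels are made up):

1,我爱你
0,猫不是狗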

Pretrained model

I did not use the transformers framework for pretraining; however, loading the model in PyTorch requires the weights to be stored in bin format, so the ckpt-format checkpoint must first be converted to bin format. The commands are as follows:

cd bert_nlpc/preTrain_model  # change into the directory containing bert_model.ckpt
transformers-cli convert --model_type bert \
  --tf_checkpoint bert_model.ckpt \
  --config bert_config.json \
  --pytorch_dump_output pytorch_model.bin
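
To check that the conversion worked, the converted weights can be loaded the same way the training script loads them (a minimal sketch; it assumes you are still inside bert_nlpc/preTrain_model):

from transformers import BertConfig, BertForSequenceClassification

# load the converted weights together with the matching config, exactly as the
# training script below does; warnings about missing classification-head weights
# are expected, since those weights are only created during fine-tuning
config = BertConfig.from_json_file('bert_config.json')
model = BertForSequenceClassification.from_pretrained('pytorch_model.bin', config=config)
print('loaded', sum(p.numel() for p in model.parameters()), 'parameters')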

Code
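
The full script is below. It loads the CSV data, fine-tunes BertForSequenceClassification with AdamW and a linear warmup schedule, reports accuracy during training, evaluates on the validation set after every epoch, saves the model and tokenizer, and finally prints a classification report on the validation data.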

import random
import numpy as np
import torch
import csv
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer,BertConfig,BertForSequenceClassification,AdamW,AutoTokenizer,AutoModel
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

class BertModle:
    def __init__(self,train,validation,vocab_path,config_path,pretrain_Model_path,saveModel_path,learning_rate,n_class,epochs,batch_size,val_batch_size,max_len,gpu=True):
        self.n_class = n_class  # number of classes
        self.max_len = max_len  # maximum sequence length
        self.lr = learning_rate  # learning rate
        self.tokenizer = BertTokenizer.from_pretrained(vocab_path)  # load the tokenizer
        self.train = self.load_data(train)  # load the training set
        self.validation = self.load_data(validation)  # load the validation set
        self.epochs = epochs
        self.batch_size = batch_size  # batch size for training
        self.val_batch_size = val_batch_size
        self.saveModel_path = saveModel_path  # where to save the trained model
        self.gpu = gpu  # whether to train on the GPU
        config = BertConfig.from_json_file(config_path)  # load the BERT model configuration
        config.num_labels = n_class  # set the number of outputs of the classification head
        self.model = BertForSequenceClassification.from_pretrained(pretrain_Model_path, config=config)  # load the BERT classification model
        if self.gpu:
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

    def encode_fn(self,text_list):
        # encode text_list into the input format expected by the BERT model
        # text_list example: ['我爱你','猫不是狗']
        tokenizer = self.tokenizer(
            text_list,
            padding = True,
            truncation = True,
            max_length = self.max_len,
            return_tensors='pt'  # return PyTorch tensors
        )
        input_ids = tokenizer['input_ids']
        token_type_ids = tokenizer['token_type_ids']
        attention_mask = tokenizer['attention_mask']
        return input_ids,token_type_ids,attention_mask

    def load_data(self,path):
        # only handles CSV files
        text_list = []
        labels = []
        for line in csv.reader(open(path,encoding='gbk')):
            label = int(line[0])  # change this index to match the column that holds the label
            text = line[1]
            text_list.append(text)
            labels.append(label)
        input_ids,token_type_ids,attention_mask = self.encode_fn(text_list)
        labels = torch.tensor(labels)
        data = TensorDataset(input_ids,token_type_ids,attention_mask,labels)
        return data

    def load_data_predict(self,path):
        # load a file of labeled texts for prediction with the BERT classification model
        text_list = []
        labels = []
        for line in csv.reader(open(path, encoding='gbk')):
            text = line[1]
            text_list.append(text)
            label = int(line[0])
            labels.append(label)
        return text_list,labels

    def flat_accuracy(self, preds, labels):
        """A function for calculating accuracy scores"""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return accuracy_score(labels_flat, pred_flat)

    def train_model(self):
        # train the model
        if self.gpu:
            self.model.cuda()
        optimizer = AdamW(self.model.parameters(), lr=self.lr)
        trainData = DataLoader(self.train, batch_size = self.batch_size, shuffle = True)  # wrap the training set into batches
        valData = DataLoader(self.validation, batch_size = self.val_batch_size, shuffle = True)

        total_steps = len(trainData) * self.epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        for epoch in range(self.epochs):
            self.model.train()
            total_loss, total_val_loss = 0, 0
            total_eval_accuracy = 0
            print('epoch:' , epoch , ', step_number:' , len(trainData))
            #训练
            for step,batch in enumerate(trainData):
                self.model.zero_grad()

                outputs = self.model(input_ids=batch[0].to(self.device),
                                     token_type_ids=batch[1].to(self.device),
                                     attention_mask=batch[2].to(self.device),
                                     labels=batch[3].to(self.device)
                                     )
                # outputs holds the loss and the raw logits for each class; softmax over the
                # logits would give class probabilities. Indexing works both for the tuple
                # returned by transformers 3.x and the ModelOutput returned by transformers 4.x.
                loss, logits = outputs[:2]
                total_loss += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                if step % 10 == 0 and step > 0:  # report training accuracy every 10 steps; flat_accuracy() takes the argmax of the logits, so no softmax is needed
                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].cpu().numpy()
                    avg_val_accuracy = self.flat_accuracy(logits, label_ids)
                    print('step:' , step)
                    print(f'Accuracy: {avg_val_accuracy:.4f}')
                    print('\n')
            # at the end of each epoch, evaluate the model on the validation set
            self.model.eval()
            print('testing ....')
            for i, batch in enumerate(valData):
                with torch.no_grad():
                    outputs = self.model(input_ids=batch[0].to(self.device),
                                         token_type_ids=batch[1].to(self.device),
                                         attention_mask=batch[2].to(self.device),
                                         labels=batch[3].to(self.device)
                                         )
                    loss, logits = outputs[:2]
                    total_val_loss += loss.item()

                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].cpu().numpy()
                    total_eval_accuracy += self.flat_accuracy(logits, label_ids)

            avg_train_loss = total_loss / len(trainData)
            avg_val_loss = total_val_loss / len(valData)
            avg_val_accuracy = total_eval_accuracy / len(valData)

            print(f'Train loss     : {avg_train_loss}')
            print(f'Validation loss: {avg_val_loss}')
            print(f'Accuracy: {avg_val_accuracy:.4f}')
            print('\n')
            self.save_model(self.saveModel_path + '-' + str(epoch))

    def save_model(self , path):
        # save the tokenizer and the classification model
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def load_model(self,path):
        # load the tokenizer and the classification model
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = BertForSequenceClassification.from_pretrained(path)
        return tokenizer,model

    def eval_model(self,Tokenizer, model,text_list,y_true):
        # print the model's precision, recall, and F1-score
        preds = self.predict_batch(Tokenizer, model, text_list)
        print(classification_report(y_true,preds))

    def predict_batch(self, Tokenizer, model, text_list):
        tokenizer = Tokenizer(
            text_list,
            padding = True,
            truncation = True,
            max_length = self.max_len,
            return_tensors='pt'  # return PyTorch tensors
        )
        input_ids = tokenizer['input_ids']
        token_type_ids = tokenizer['token_type_ids']
        attention_mask = tokenizer['attention_mask']
        pred_data = TensorDataset(input_ids,token_type_ids,attention_mask)
        pred_dataloader = DataLoader(pred_data, batch_size=self.batch_size, shuffle=False)
        model = model.to(self.device)
        model.eval()
        preds = []
        for i, batch in enumerate(pred_dataloader):
            with torch.no_grad():
                outputs = model(input_ids=batch[0].to(self.device),
                                token_type_ids=batch[1].to(self.device),
                                attention_mask=batch[2].to(self.device)
                                )
                logits = outputs[0]
                logits = logits.detach().cpu().numpy()
                preds += list(np.argmax(logits, axis=1))
        return preds

if __name__ == '__main__':
    epoch = 3
    # the pretrained model lives in ../../preTrain_model/gongdan_step5000_ml128/
    # the fine-tuned classification model and tokenizer are saved to ../trained_model/bert_model/gongdan_step5000_ml128/
    model_file = 'preTrain_model'	
    trained_model_file = 'bert_model'
    model_name = 'gongdan_step5000_ml128'
    bert_model = BertModle(
        train = '../../data/train.csv',
        validation = '../../data/validation.csv',
        vocab_path = '../../'+ model_file +'/'+ model_name +'/vocab.txt',
        config_path = '../../' + model_file + '/'+ model_name +'/bert_config.json',
        pretrain_Model_path = '../../'+ model_file +'/'+ model_name +'/pytorch_model.bin',  # the file produced by the conversion step above
        saveModel_path = '../trained_model/'+ trained_model_file+'/'+model_name,
        learning_rate = 2e-5,
        n_class = 8,
        epochs = epoch,
        batch_size = 4,
        val_batch_size = 4,
        max_len = 100 ,
        gpu = True
    )
    bert_model.train_model()
    Tokenizer,model = bert_model.load_model(bert_model.saveModel_path + '-'+str(epoch-1))
    text_list,y_true = bert_model.load_data_predict('../../data/validation.csv')
    bert_model.eval_model(Tokenizer, model,text_list,y_true)
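
After training, the saved model can also be used to classify new, unlabeled texts through predict_batch. A minimal sketch that continues the __main__ block above (the example sentences are hypothetical):

    new_texts = ['我爱你', '猫不是狗']  # hypothetical inputs
    preds = bert_model.predict_batch(Tokenizer, model, new_texts)
    print(preds)  # predicted label indices, one per input text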