该文章转载自作者:weixin_40001805
仅供学习参考!!!
之前用bert一直都是根据keras-bert封装库操作的,操作非常简便(可参考苏剑林大佬博客当Bert遇上Keras:这可能是Bert最简单的打开姿势),这次想要来尝试一下基于pytorch的bert实践。
最近pytorch大火,而目前很少有博客完整地给出pytorch-bert的应用代码,本文从最简单的中文文本分类入手,一步一步地给出每段代码~(代码简单清晰,读者有兴趣可上手实践)
- 首先安装pytorch-bert库, 即:pip install pytorch_pretrained_bert;
- 然后下载预训练模型权重,这里下载的是 chinese_roberta_wwm_ext_pytorch,下载链接见"中文BERT-wwm系列模型"(这里可选择多种模型);
- 数据集选择的是THUCNews,整理出18w条数据,是10类新闻文本的中文分类问题(10分类),每类新闻数据量相等,均为1.8w条,数据集来自train.txt(只选择了网址里的train.txt),
数据集的具体格式如下。
下面进入代码阶段。(训练环境为Google Colab)
1.导入必要的库
# coding: UTF-8
import torch
import time
import torch.nn as nn
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig, BertAdam
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.data import *
# Directory of the dataset files and directory of the pretrained BERT files.
path, bert_path = "data/", "chinese_roberta_wwm_ext_pytorch/"
# Initialise the tokenizer from the vocab.txt stored under bert_path.
tokenizer = BertTokenizer(vocab_file="".join((bert_path, "vocab.txt")))
2.预处理数据集
input_ids = []    # token ids of every example
input_types = []  # segment (token-type) ids of every example
input_masks = []  # attention masks of every example
label = []        # class label of every example, stored as a one-element list
# Also known as max_len. According to the author's earlier length statistics
# the maximum text length is 38, and 32 already covers 99% of the samples.
pad_size = 32


def _fit_to_length(seq, size, fill):
    """Return ``seq`` truncated to ``size`` items, or right-padded with ``fill``."""
    return seq[:size] + [fill] * max(0, size - len(seq))


with open(path + "train.txt", encoding='utf-8') as f:
    for line in tqdm(f):
        line = line.strip()
        if not line:
            continue  # a blank line would otherwise break the unpacking below
        # Each record is "<text>\t<label>". Split on the LAST tab so a tab that
        # happens to occur inside the text cannot break the two-way unpacking.
        text, y = line.rsplit('\t', 1)
        tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]

        # Derive input ids, segment ids and attention mask.
        ids = tokenizer.convert_tokens_to_ids(tokens)
        real_len = len(ids)

        # Too short -> pad on the right; too long -> cut at pad_size.
        # NOTE: cutting can drop the trailing [SEP] of an over-long text.
        masks = _fit_to_length([1] * real_len, pad_size, 0)
        # Padded positions get segment id 1; they are masked out by `masks`.
        types = _fit_to_length([0] * real_len, pad_size, 1)
        ids = _fit_to_length(ids, pad_size, 0)
        assert len(ids) == len(masks) == len(types) == pad_size

        input_ids.append(ids)
        input_types.append(types)
        input_masks.append(masks)
        label.append([int(y)])
输出:180000it [00:26, 6728.85it/s] (26秒,速度较快)
3.切分训练集和测试集
# 随机打乱索引
# Shuffle the example indices; the fixed seed makes the split reproducible.
random_order = list(range(len(input_ids)))
np.random.seed(2020)
np.random.shuffle(random_order)
print(random_order[:10])

# 4:1 split: the first 80% of the shuffled indices train, the rest test.
_n_train = int(len(input_ids) * 0.8)
_train_idx = random_order[:_n_train]
_test_idx = random_order[_n_train:]

# Convert each feature list to an array once, then pick rows by index list.
_ids_all = np.array(input_ids)
_types_all = np.array(input_types)
_masks_all = np.array(input_masks)
_label_all = np.array(label)

input_ids_train = _ids_all[_train_idx]
input_types_train = _types_all[_train_idx]
input_masks_train = _masks_all[_train_idx]
y_train = _label_all[_train_idx]
print(input_ids_train.shape, input_types_train.shape, input_masks_train.shape, y_train.shape)

input_ids_test = _ids_all[_test_idx]
input_types_test = _types_all[_test_idx]
input_masks_test = _masks_all[_test_idx]
y_test = _label_all[_test_idx]
print(input_ids_test.shape, input_types_test.shape, input_masks_test.shape, y_test.shape)
得到结果
4.加载到高效的DataLoader
BATCH_SIZE = 16


def _as_long_dataset(*arrays):
    """Convert every array to a ``torch.LongTensor`` and bundle them in a ``TensorDataset``."""
    return TensorDataset(*[torch.LongTensor(arr) for arr in arrays])


# Training set: batches are drawn through a RandomSampler.
train_data = _as_long_dataset(input_ids_train, input_types_train,
                              input_masks_train, y_train)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Test set: batches are visited in order through a SequentialSampler.
test_data = _as_long_dataset(input_ids_test, input_types_test,
                             input_masks_test, y_test)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)
5.定义bert模型
class Model(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer.

    Args:
        num_classes: Number of output classes. Defaults to 10, the number of
            news categories used in this article.
        hidden_size: Size of BERT's pooled output that feeds the classifier.
            Defaults to 768.
    """

    def __init__(self, num_classes=10, hidden_size=768):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True  # every BERT parameter receives gradients
        self.fc = nn.Linear(hidden_size, num_classes)  # hidden_size -> num_classes

    def forward(self, x):
        """Return the class logits for one batch.

        Args:
            x: Indexable of three tensors: ``x[0]`` token ids, ``x[1]`` segment
                ids and ``x[2]`` attention mask. The mask has the same size as
                the token ids, with 0 marking padding, e.g. [1, 1, 1, 1, 0, 0].

        Returns:
            The output of the linear layer applied to BERT's pooled output.
        """
        context, types, mask = x[0], x[1], x[2]
        # output_all_encoded_layers controls whether every encoder layer's
        # result is returned; only the pooled output is needed here.
        _, pooled = self.bert(context,
                              token_type_ids=types,
                              attention_mask=mask,
                              output_all_encoded_layers=False)
        return self.fc(pooled)
可以发现,bert模型的定义由于高效简易的封装库存在,使得定义模型较为容易,如果想要在bert之后加入cnn/rnn等层,可在这里定义。
6.实例化bert模型
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
model = Model()
model = model.to(DEVICE)
print(model)
得到结果
bert模型结构,未完整输出,可根据这个输出学习bert的内部结构
7.定义优化器
param_optimizer = list(model.named_parameters())  # (name, parameter) pairs of the model
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

# Parameters whose name contains any `no_decay` fragment get weight_decay 0.0;
# all remaining parameters get weight_decay 0.01.
_decayed, _not_decayed = [], []
for _name, _param in param_optimizer:
    if any(fragment in _name for fragment in no_decay):
        _not_decayed.append(_param)
    else:
        _decayed.append(_param)
optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _not_decayed, 'weight_decay': 0.0},
]

NUM_EPOCHS = 3
# t_total = number of batches per epoch times the number of epochs.
_total_steps = len(train_loader) * NUM_EPOCHS
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=0.05,
                     t_total=_total_steps)
# For simplicity, this single line can be used instead:
# optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
8.定义训练函数和测试函数
def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Prints the current loss after every 100th batch.

    Args:
        model: Module called as ``model([x1, x2, x3])`` that returns logits.
        device: Device the batch tensors are moved to.
        train_loader: Iterable of ``(x1, x2, x3, y)`` batches; it must support
            ``len()`` and expose ``.dataset`` for the progress message.
        optimizer: Optimizer whose ``step()`` is called once per batch.
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    for batch_idx, (x1, x2, x3, y) in enumerate(train_loader):
        x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
        model.zero_grad()  # clear gradients left over from the previous batch
        y_pred = model([x1, x2, x3])
        # reshape(-1) rather than squeeze(): squeeze() turns a batch of one
        # into a 0-d target, which cross_entropy rejects.
        loss = F.cross_entropy(y_pred, y.reshape(-1))
        loss.backward()
        optimizer.step()
        done = batch_idx + 1
        if done % 100 == 0:  # report the loss
            print('Train Epoch: {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}'.format(
                epoch,
                done * len(x1),
                len(train_loader.dataset),
                100. * done / len(train_loader),  # same batch count as the sample counter
                loss.item()))  # .item() yields a plain Python float
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return its accuracy.

    Prints the loss averaged over batches together with the accuracy.

    Args:
        model: Module called as ``model([x1, x2, x3])`` that returns logits.
        device: Device the batch tensors are moved to.
        test_loader: Iterable of ``(x1, x2, x3, y)`` batches; it must support
            ``len()`` and expose ``.dataset``.

    Returns:
        Number of correct predictions divided by ``len(test_loader.dataset)``.
    """
    model.eval()
    test_loss = 0.0
    acc = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for x1, x2, x3, y in test_loader:
            x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
            y_ = model([x1, x2, x3])
            # reshape(-1) rather than squeeze(): squeeze() turns a batch of
            # one into a 0-d target, which cross_entropy rejects.
            target = y.reshape(-1)
            # .item() keeps the running loss a plain float instead of a tensor.
            test_loss += F.cross_entropy(y_, target).item()
            pred = y_.max(-1)[1]  # .max() returns (values, indices); keep the indices
            acc += pred.eq(target).sum().item()
    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        test_loss, acc, len(test_loader.dataset),
        100. * acc / len(test_loader.dataset)))
    return acc / len(test_loader.dataset)
9.开始训练和测试
best_acc = 0.0
PATH = 'roberta_model.pth'  # where the best model's state dict is saved
for epoch in range(1, NUM_EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    acc = test(model, DEVICE, test_loader)
    if best_acc < acc:
        # Keep only the weights of the best-scoring epoch so far.
        best_acc = acc
        torch.save(model.state_dict(), PATH)
    # The trailing "\n" (previously a stray letter "n") separates epochs in the log.
    print("acc is: {:.4f}, best acc is {:.4f}\n".format(acc, best_acc))
输出:(训练时间较长,这里只训练了一个epoch,测试集得到0.9407的accuracy)
10.加载最优模型进行测试
# Reload the best saved weights and evaluate them once more.
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on
# a GPU can still be loaded when only a CPU is available.
model.load_state_dict(torch.load("roberta_model.pth", map_location=DEVICE))
acc = test(model, DEVICE, test_loader)
# For competitions, the snippet below can also serve as a reference.
# NOTE: it is wrapped in a triple-quoted string, so it is never executed.
# It iterates a test_loader that yields (x1, x2, x3) batches without labels
# and collects the predicted class indices into a list.
"""
# 测试集提交
PATH = "roberta_model.pth"
model.load_state_dict(torch.load(PATH))
def test_for_submit(model, device, test_loader): # 测试模型
model.eval()
preds = []
for batch_idx, (x1,x2,x3) in tqdm(enumerate(test_loader)):
x1,x2,x3 = x1.to(device), x2.to(device), x3.to(device)
with torch.no_grad():
y_ = model([x1,x2,x3])
pred = y_.max(-1, keepdim=True)[1].squeeze().cpu().tolist()
# .max() 2输出,分别为最大值和最大值的index
preds.extend(pred)
return preds
preds = test_for_submit(model, DEVICE, test_loader)
"""
得到结果
经过以上10步,即可建立起较为完整的pytorch-bert文本分类体系,代码也较为简单易懂,对读者有帮助记得点个赞呀~
完结-