BERT (Bidirectional Encoder Representations from Transformers) was published by Google AI Language in 2018 (the paper appeared at NAACL 2019), and in a sense it opened a new era for NLP. It uses the encoder stack of the Transformer and is pre-trained on two tasks, Masked Language Modeling (MLM) and Next Sentence Prediction (NSP), over large amounts of unlabeled data; the pre-trained model is then fine-tuned on the dataset of each downstream task.
Its counterpart is OpenAI's GPT series, which uses the decoder stack of the Transformer. BERT's influence, however, has been at least an order of magnitude greater than that of the GPT series, and it is widely used by researchers.
Here I share my walkthrough of BERT's main network code. Time was limited and this was written in a hurry, so mistakes are inevitable; corrections are welcome.
For the code, I use a third-party PyTorch re-implementation.
paper: https://arxiv.org/pdf/1810.04805.pdf
code: https://github.com/codertimo/BERT-pytorch
1. BERT main network code. I have not included the internals of the Transformer (multi-head attention, sublayer connections, etc.); readers who need them can find them in the repository above.
import torch.nn as nn
import torch
import math
from .attention import MultiHeadedAttention
from .utils import SublayerConnection, PositionwiseFeedForward
#-----------------------------------------------------------------#
# BERT: Bidirectional Encoder Representations from Transformers,
# i.e. bidirectional encoder representations built on the Transformer
#-----------------------------------------------------------------#
class BERT(nn.Module):
def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
super(BERT,self).__init__()
"""
:param vocab_size: size of the full vocabulary
:param hidden: BERT model hidden size
:param n_layers: number of Transformer blocks (layers), i.e. the number of stacked encoder layers
:param attn_heads: number of attention heads
:param dropout: dropout rate
"""
self.hidden = hidden
self.n_layers = n_layers
self.attn_heads = attn_heads
#----------------------------------------------#
# for the feed-forward network, the paper uses
# feed_forward_hidden = 4 * hidden_size
#----------------------------------------------#
self.feed_forward_hidden = hidden * 4
#----------------------------------------------#
# BERT's input encoding: the sum of the token,
# segment and positional embeddings
#----------------------------------------------#
self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)
#-------------------------------------------------#
# the Transformer trunk: a stack of n_layers
# Transformer encoder blocks
#-------------------------------------------------#
self.transformer_blocks = nn.ModuleList([TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
def forward(self, x, segment_info):
# attention mask for padded tokens (token id 0 = padding):
# (x > 0): [batch_size, seq_len] -> unsqueeze(1): [batch_size, 1, seq_len]
# -> repeat: [batch_size, seq_len, seq_len] -> unsqueeze(1): [batch_size, 1, seq_len, seq_len]
# i.e. a boolean mask of shape [batch_size, 1, seq_len, seq_len]
mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
#---------------------------------------------------#
# embed the indexed sentence into a sequence of vectors
#---------------------------------------------------#
x = self.embedding(x, segment_info)
#---------------------------------------------------#
# pass through transformer layers
#---------------------------------------------------#
for transformer in self.transformer_blocks:
x = transformer.forward(x, mask)
return x
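To make the tensor shapes concrete, here is a minimal usage sketch of the BERT class. It assumes the full repository is installed (the .attention and .utils modules imported above), and all sizes below are made up purely for illustration.
import torch
# made-up toy sizes, only for checking shapes
vocab_size, batch_size, seq_len = 1000, 2, 16
bert = BERT(vocab_size=vocab_size, hidden=768, n_layers=2, attn_heads=12)
# token ids; index 0 is reserved for padding
tokens = torch.randint(1, vocab_size, (batch_size, seq_len))
# segment labels: 1 = sentence A, 2 = sentence B, 0 = padding
segments = torch.cat([torch.full((batch_size, seq_len // 2), 1, dtype=torch.long),
                      torch.full((batch_size, seq_len // 2), 2, dtype=torch.long)], dim=1)
hidden_states = bert(tokens, segments)
print(hidden_states.shape)  # torch.Size([2, 16, 768]): one 768-dim vector per token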
#---------------------------------------------------------------------------------#
# definition of the BERTEmbedding class
# BERT embedding, consisting of the following features:
# 1. TokenEmbedding      : normal embedding matrix
# 2. PositionalEmbedding : positional information encoded with sin/cos
# 3. SegmentEmbedding    : sentence segment info (sent_A: 1, sent_B: 2, padding: 0)
#---------------------------------------------------------------------------------#
class BERTEmbedding(nn.Module):
def __init__(self, vocab_size, embed_size, dropout=0.1):
"""
:param vocab_size: total vocab size
:param embed_size: embedding size of token embedding
:param dropout: dropout rate
"""
super(BERTEmbedding,self).__init__()
self.embed_size = embed_size
self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
self.dropout = nn.Dropout(p=dropout)
#------------------------------------------------------------------------#
# sum the TokenEmbedding, PositionalEmbedding and SegmentEmbedding,
# then apply Dropout to reduce the risk of overfitting;
# the result is the output of BERTEmbedding
#------------------------------------------------------------------------#
def forward(self, sequence, segment_label):
x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
return self.dropout(x)
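One detail worth noting: self.position(sequence) returns a tensor of shape (1, seq_len, d_model) (see PositionalEmbedding.forward below), while the token and segment embeddings have shape (batch_size, seq_len, d_model), so the three-way sum relies on PyTorch broadcasting over the batch dimension. A minimal sketch with plain tensors and made-up sizes:
import torch
batch_size, seq_len, d_model = 2, 8, 16
token_emb = torch.randn(batch_size, seq_len, d_model)    # (2, 8, 16)
segment_emb = torch.randn(batch_size, seq_len, d_model)  # (2, 8, 16)
pos_emb = torch.randn(1, seq_len, d_model)               # (1, 8, 16), shared by the whole batch
summed = token_emb + pos_emb + segment_emb               # pos_emb broadcasts over the batch dimension
print(summed.shape)                                      # torch.Size([2, 8, 16])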
#-----------------------------------------------#
# definitions of TokenEmbedding and SegmentEmbedding
# both subclass nn.Embedding directly
#-----------------------------------------------#
class TokenEmbedding(nn.Embedding):
def __init__(self, vocab_size, embed_size=512):
super(TokenEmbedding,self).__init__(vocab_size, embed_size, padding_idx=0)
class SegmentEmbedding(nn.Embedding):
def __init__(self, embed_size=512):
super(SegmentEmbedding,self).__init__(3, embed_size, padding_idx=0)
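Both classes pass padding_idx=0 to nn.Embedding, so row 0 of the embedding matrix (the padding token / padding segment) is initialized to zeros and never receives gradient updates. A small self-contained check with made-up sizes:
import torch
import torch.nn as nn
emb = nn.Embedding(10, 4, padding_idx=0)
print(emb.weight[0])       # all zeros: the padding row starts at zero
out = emb(torch.tensor([0, 3, 0]))
out.sum().backward()
print(emb.weight.grad[0])  # all zeros: the padding row gets no gradient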
#-----------------------------------#
# definition of the PositionalEmbedding class
#-----------------------------------#
class PositionalEmbedding(nn.Module):
def __init__(self, d_model, max_len=512):
super(PositionalEmbedding,self).__init__()
#------------------------------#
# sinusoidal position encoding
#------------------------------#
pe = torch.zeros(max_len, d_model).float()
pe.requires_grad = False
position = torch.arange(0, max_len).float().unsqueeze(1)
div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
#----------------------------------------------#
# sin encodes the even embedding dimensions,
# cos encodes the odd embedding dimensions
#----------------------------------------------#
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
#------------------------------------------#
# register pe as a buffer so that it is saved
# in the state dict but never updated by
# gradient descent during training
#------------------------------------------#
self.register_buffer('pe', pe)
def forward(self, x):
return self.pe[:, :x.size(1)]
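The div_term line is just the paper's formula rewritten: exp(-2i * ln(10000) / d_model) equals 10000^(-2i / d_model), so pe[pos, 2i] = sin(pos / 10000^(2i/d_model)) and pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model)). A quick check of that equivalence with made-up sizes:
import math
import torch
d_model, max_len = 16, 32
position = torch.arange(0, max_len).float().unsqueeze(1)
div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
# direct form from the paper: pos / 10000^(2i / d_model)
direct = position / (10000.0 ** (torch.arange(0, d_model, 2).float() / d_model))
print(torch.allclose(position * div_term, direct))  # True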
#-----------------------------------------#
# definition of the Transformer block
# BERT only uses the encoder part of the
# Transformer; positional and token embeddings
# are handled in BERTEmbedding instead
#-----------------------------------------#
class TransformerBlock(nn.Module):
"""
Bidirectional Encoder = Transformer (self-attention)
Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
"""
def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
"""
:param hidden: hidden size of transformer
:param attn_heads: number of attention heads in multi-head attention
:param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
:param dropout: dropout rate
"""
super(TransformerBlock,self).__init__()
self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
self.dropout = nn.Dropout(p=dropout)
#-------------------------------------#
# 1.run over MHA
# 2.run over input_sublayer(add&norm)
# 3.run over FFN
# 4.run over output_sublayer(add&norm)
# 5.run over dropout
#-------------------------------------#
def forward(self, x, mask):
x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
x = self.output_sublayer(x, self.feed_forward)
return self.dropout(x)
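Since I did not paste the repository's .utils module, here is a rough sketch of what SublayerConnection and PositionwiseFeedForward typically look like in this style of implementation (the repository follows the Annotated Transformer pattern): a pre-LayerNorm residual connection with dropout, and a two-layer position-wise MLP. Treat this as my own illustrative sketch, not a verbatim copy of the repository's code:
import torch.nn as nn
class SublayerConnection(nn.Module):
    """Residual connection around a sublayer, with layer norm applied first (pre-norm)."""
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, sublayer):
        # x + dropout(sublayer(norm(x)))
        return x + self.dropout(sublayer(self.norm(x)))
class PositionwiseFeedForward(nn.Module):
    """Two-layer MLP applied independently at every position."""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()  # BERT uses GELU rather than ReLU
    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))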
2. Models for BERT's two pre-training tasks: 1. MLM (Masked Language Model); 2. Next Sentence Prediction.
import torch.nn as nn
from .bert import BERT
#----------------------------------------------------------------#
# definition of the BERT Language Model
# Next Sentence Prediction Model + Masked Language Model
# the constructor parameter "bert: BERT" is a type annotation:
# bert is the parameter name, BERT is the class of the model passed in
#----------------------------------------------------------------#
class BERTLM(nn.Module):
def __init__(self, bert: BERT, vocab_size):
"""
:param bert: BERT model which should be trained
:param vocab_size: total vocab size for masked_lm
"""
super(BERTLM,self).__init__()
self.bert = bert
#--------------------------------------#
# heads for the two pre-training tasks
#--------------------------------------#
self.next_sentence = NextSentencePrediction(self.bert.hidden)
self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size)
#--------------------------------------------------#
# forward returns two log-probability outputs:
# NSP scores over 2 classes and MLM scores over the vocabulary
#--------------------------------------------------#
def forward(self, x, segment_label):
x = self.bert(x, segment_label)
return self.next_sentence(x), self.mask_lm(x)
#-----------------------------------------------------#
# Next Sentence Prediction Model的定义
# 2-class classification model : is_next, is_not_next
#-----------------------------------------------------#
class NextSentencePrediction(nn.Module):
def __init__(self, hidden):
"""
:param hidden: BERT model output size
"""
super(NextSentencePrediction,self).__init__()
#-----------------------------------------------#
# the Linear layer maps the BERT output size -> 2,
# corresponding to the classes is_next / is_not_next
# dim=-1: LogSoftmax along the last dimension
#-----------------------------------------------#
self.linear = nn.Linear(hidden, 2)
self.softmax = nn.LogSoftmax(dim=-1)
#--------------------------------------------------#
# x[:, 0] takes the hidden state of the first token
# ([CLS]) of every sequence in the batch
#--------------------------------------------------#
def forward(self, x):
return self.softmax(self.linear(x[:, 0]))
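A quick shape illustration with made-up sizes (assuming the NextSentencePrediction class above is in scope): for a BERT output of shape (batch_size, seq_len, hidden), x[:, 0] keeps only the first ([CLS]) position of every sequence, and the head maps it to two log-probabilities:
import torch
x = torch.randn(4, 16, 768)  # (batch_size, seq_len, hidden)
cls = x[:, 0]                # (batch_size, hidden): hidden state of the [CLS] token
print(cls.shape)             # torch.Size([4, 768])
head = NextSentencePrediction(hidden=768)
print(head(x).shape)         # torch.Size([4, 2]): log-probabilities for is_next / is_not_next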
#------------------------------------------------------#
# definition of the Masked Language Model head
# predicting the original token from the masked input sequence
# n-class classification problem, where n = vocab_size
#------------------------------------------------------#
class MaskedLanguageModel(nn.Module):
def __init__(self, hidden, vocab_size):
"""
:param hidden: output size of BERT model
:param vocab_size: total vocab size
"""
super(MaskedLanguageModel,self).__init__()
self.linear = nn.Linear(hidden, vocab_size)
self.softmax = nn.LogSoftmax(dim=-1)
#--------------------------------------------------#
# apply the linear layer to x, then output the
# log-probability distribution over the vocabulary
#--------------------------------------------------#
def forward(self, x):
return self.softmax(self.linear(x))
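Finally, since both heads end in LogSoftmax, the natural training criterion is nn.NLLLoss. Below is a minimal sketch of how the two losses could be combined during pre-training. It is my own simplification rather than the repository's exact pretrain script, and it assumes the convention that label 0 in the MLM targets marks padded or unmasked positions (hence ignore_index=0):
import torch.nn as nn
# assume bert_lm = BERTLM(bert, vocab_size) and a batch of tensors:
#   tokens         (batch_size, seq_len)  input ids with [MASK] tokens inserted
#   segment_label  (batch_size, seq_len)  1 / 2 for sentence A / B, 0 for padding
#   is_next        (batch_size,)          1 if sentence B really follows A, else 0
#   mlm_labels     (batch_size, seq_len)  original ids at masked positions, 0 elsewhere
nsp_criterion = nn.NLLLoss()
mlm_criterion = nn.NLLLoss(ignore_index=0)  # do not score padded / unmasked positions
def pretrain_step(bert_lm, tokens, segment_label, is_next, mlm_labels):
    nsp_out, mlm_out = bert_lm(tokens, segment_label)              # (B, 2), (B, L, vocab_size)
    nsp_loss = nsp_criterion(nsp_out, is_next)
    mlm_loss = mlm_criterion(mlm_out.transpose(1, 2), mlm_labels)  # NLLLoss expects (B, C, L)
    return nsp_loss + mlm_loss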