Data augmentation is an effective way to prevent overfitting and enlarge the training set. Common NLP augmentation methods include back-translation, synonym replacement, random insertion, random swap, random deletion, and adversarial training. This post briefly introduces each of them.
1. Back-translation
# Tested against several alternatives: this translation package had the shortest round-trip time
from pygtrans import Translate

def backTran(wordss):
    client = Translate()
    text1 = client.translate(wordss)                              # translate to the default target language
    text2 = client.translate(text1.translatedText, target='en')  # translate back to English
    return text2.translatedText
Usage example

words = "hello world"  # pass in an English string
backTranWord = backTran(words)
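pygtrans can also translate a list of strings in one request (an assumption based on its documented API; check the version you have installed), which is much faster when round-tripping a whole dataset. A minimal sketch:

def backTranBatch(sentences):
    # round-trip a list of English sentences in two batched calls
    client = Translate()
    mids = client.translate(sentences)  # to the default target language
    backs = client.translate([t.translatedText for t in mids], target='en')
    return [t.translatedText for t in backs]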
2. Synonym replacement
########################################################################
# Synonym replacement
# Replace n words in a sentence with their synonyms
########################################################################
# (run once before first use) download the required NLTK corpora:
# import nltk; nltk.download('stopwords'); nltk.download('wordnet')
from nltk.corpus import stopwords  # stop words: "augmenting" a stop word adds no real variation
from nltk.corpus import wordnet as wn  # WordNet supplies the synonyms
import random

stop_words = set(stopwords.words('english'))  # a set, so extra tokens can be added below
for w in ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']:
    stop_words.add(w)
# words is a list,
# e.g. "hello world".split(" ") or ["hello", "world"]
def synonym_replacement(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return " ".join(new_words)
# Fetch synonyms from WordNet; guard against words that have no synset at all
def get_synonyms(word):
    nearbyWordSet = wn.synsets(word)
    if not nearbyWordSet:
        return []
    return nearbyWordSet[0].lemma_names()
Usage example

words = "hello world"
lenW = max(1, len(words.split(" ")) // 4)  # replace roughly 1/4 of the words
newWords = synonym_replacement(words.split(" "), lenW)
3. Random insertion
########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################
# words is a list,
# e.g. "hello world".split(" ") or ["hello", "world"]
def random_insertion(words, n):
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return " ".join(new_words)
# Insert a synonym of a randomly chosen word at a random position
def add_word(new_words):
    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:  # give up if no word with synonyms is found
            return
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)
Usage example

words = "hello world"
lenW = max(1, len(words.split(" ")) // 4)  # insert roughly 1/4 as many new words
newWords = random_insertion(words.split(" "), lenW)
4. Random swap
########################################################################
# Random swap
# Swap two random words, n times
########################################################################
# words is a list,
# e.g. "hello world".split(" ") or ["hello", "world"]
def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return " ".join(new_words)
def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words
words="hello world"
lenW=len(words)/4#随机替换1/4的词语
newWrods=random_swap(words.split(" "),lenW)
5. Random deletion
########################################################################
# Random deletion
# Delete each word in the sentence with probability p
########################################################################
# words is a list,
# e.g. "hello world".split(" ") or ["hello", "world"]
def random_deletion(words, p):
    if len(words) == 1:
        # nothing sensible to delete from a one-word sentence
        return " ".join(words)
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)
    if len(new_words) == 0:
        # keep at least one random word so the result is never empty
        rand_int = random.randint(0, len(words)-1)
        return words[rand_int]
    return " ".join(new_words)
words="hello world"
#随机替换1/4的词语
newWrods=random_deletion(words.split(" "),1/4)
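For convenience, the four operations above can be wrapped into a single EDA-style helper that returns several augmented variants of one sentence. This is only a sketch built on the functions defined in this post; eda and alpha are illustrative names, not from the original:

def eda(sentence, alpha=0.25):
    # apply each of the four operations once to the same sentence
    words = sentence.split(" ")
    n = max(1, int(alpha * len(words)))  # how many words to touch
    return [
        synonym_replacement(words, n),
        random_insertion(words, n),
        random_swap(words, n),
        random_deletion(words, alpha),  # deletion takes a probability, not a count
    ]

augmented = eda("the quick brown fox jumps over the lazy dog")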
6. Data augmentation with adversarial training
Adversarial training augments the data by adding small perturbations to the embedded word vectors, which effectively generates new samples; the whole process happens during training. In this competition we used two perturbation methods, Fast Gradient Method (FGM) and Projected Gradient Descent (PGD); their usage is shown below.
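For reference, these are the standard formulations of the two methods (matching the epsilon and alpha parameters in the code below, not part of the original post): FGM applies one perturbation along the normalized gradient of the loss with respect to the embedding matrix E, while PGD takes several smaller steps and projects each result back into an ε-ball around the original embedding E₀:

FGM:  r_adv = ε · g / ‖g‖₂,  where g = ∇_E L(θ, x, y)
PGD:  E_{t+1} = Π_{‖E − E₀‖ ≤ ε} ( E_t + α · g_t / ‖g_t‖₂ )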
1) FGM
# Definition
import torch

class FGM(object):
    def __init__(self, model):
        super(FGM, self).__init__()
        self.model = model
        self.backup = {}  # backup of the original embedding weights

    def attack(self, epsilon=1., emb_name='emb.'):
        # replace emb_name with the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm  # step along the normalized gradient
                    param.data.add_(r_at)

    def restore(self, emb_name='emb.'):
        # replace emb_name with the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
# Usage
fgm = FGM(model)
for batch_input, batch_label in data:
    # normal training step
    loss = model(batch_input, batch_label)
    loss.backward()  # backpropagate to get the clean gradients
    # adversarial step
    fgm.attack()  # add the adversarial perturbation to the embedding
    loss_adv = model(batch_input, batch_label)
    loss_adv.backward()  # backpropagate again, accumulating the adversarial gradients on top of the clean ones
    fgm.restore()  # restore the embedding weights
    # gradient descent: update the parameters
    optimizer.step()
    model.zero_grad()
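Note that because nothing zeroes the gradients between the two backward passes, loss_adv.backward() adds the adversarial gradient to the clean one; the subsequent optimizer.step() therefore effectively minimizes loss + loss_adv, i.e. it trains on the clean and the perturbed sample jointly.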
2) PGD
# Definition
import torch

class PGD(object):
    def __init__(self, model):
        self.model = model
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, epsilon=1., alpha=0.3, emb_name='emb.', is_first_attack=False):
        # replace emb_name with the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = alpha * param.grad / norm  # small step along the normalized gradient
                    param.data.add_(r_at)
                    param.data = self.project(name, param.data, epsilon)

    def restore(self, emb_name='emb.'):
        # replace emb_name with the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.emb_backup
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, param_name, param_data, epsilon):
        # project the perturbed weights back onto the epsilon-ball around the original weights
        r = param_data - self.emb_backup[param_name]
        if torch.norm(r) > epsilon:
            r = epsilon * r / torch.norm(r)
        return self.emb_backup[param_name] + r

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()  # clone, otherwise zero_grad() would wipe the backup

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.grad_backup:
                param.grad = self.grad_backup[name]
# Usage
pgd = PGD(model)
K = 3
for batch_input, batch_label in data:
    # normal training step
    loss = model(batch_input, batch_label)
    loss.backward()  # backpropagate to get the clean gradients
    pgd.backup_grad()
    # adversarial steps
    for t in range(K):
        pgd.attack(is_first_attack=(t == 0))  # perturb the embedding; back up param.data on the first attack
        if t != K-1:
            model.zero_grad()  # intermediate steps only need gradients for the next attack
        else:
            pgd.restore_grad()  # on the last step, restore the clean gradients so they accumulate
        loss_adv = model(batch_input, batch_label)
        loss_adv.backward()  # backpropagate, accumulating the adversarial gradients on top of the clean ones
    pgd.restore()  # restore the embedding weights
    # gradient descent: update the parameters
    optimizer.step()
    model.zero_grad()
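A practical note on emb_name: attack() and restore() match it as a substring of the names returned by model.named_parameters(), so it must be adapted to your model. For a Hugging Face BERT model, for example, the word embedding weight usually appears as bert.embeddings.word_embeddings.weight (verify with named_parameters() on your own model), so the calls would look like:

fgm = FGM(model)
fgm.attack(emb_name='word_embeddings')   # substring of 'bert.embeddings.word_embeddings.weight'
fgm.restore(emb_name='word_embeddings')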