Data augmentation is an effective way to prevent overfitting and enlarge the training set. Common data augmentation methods in NLP include back-translation, synonym replacement, random insertion, random swap, random deletion, and adversarial training. This article gives a brief introduction to each of them.

1. Back-translation

# Among the translation packages tested, this one was the fastest
from pygtrans import Translate

def backTran(wordss):
    client = Translate()
    # first hop: translate to the client's default target language
    text1 = client.translate(wordss)
    # second hop: translate the result back to English
    text2 = client.translate(text1.translatedText, target='en')
    return text2.translatedText

Usage example

words="hello world"#传入一个英文的字符串就行
backTranWord=backTran(words)
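A single round trip through one language can come back nearly unchanged for short inputs. A minimal sketch of a multi-hop variant that pivots through extra languages before returning to English (the helper name backTranMulti and the pivot codes are my own choices, assuming pygtrans accepts standard Google Translate language codes):

from pygtrans import Translate

def backTranMulti(text, pivots=('zh-CN', 'fr')):
    # chain the translation through each pivot language, then back to English
    client = Translate()
    for lang in pivots:
        text = client.translate(text, target=lang).translatedText
    return client.translate(text, target='en').translatedText

print(backTranMulti("hello world"))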

2. Synonym replacement

########################################################################
# Synonym replacement
# Replace n words in a sentence with their synonyms
########################################################################


from nltk.corpus import stopwords # stop words are excluded: "augmenting" them adds no real variation
from nltk.corpus import wordnet as wn # WordNet provides the synonyms
import random
# requires nltk.download('stopwords') and nltk.download('wordnet') on first use
stop_words = set(stopwords.words('english')) # a set, so .add() below works
for w in ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']:
    stop_words.add(w)
   
# words is a list of tokens,
# e.g. "hello world".split(" ") or ["hello", "world"]
def synonym_replacement(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))     
    random.shuffle(random_word_list)
    num_replaced = 0  
    for random_word in random_word_list:          
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)   
            new_words = [synonym if word == random_word else word for word in new_words]   
            num_replaced += 1
        if num_replaced >= n: 
            break

    return " ".join(new_words)
# fetch synonyms from WordNet; return an empty list if the word is unknown
def get_synonyms(word):
    nearbyWordSet = wn.synsets(word)
    if not nearbyWordSet:
        return []
    return nearbyWordSet[0].lemma_names()
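The version above only looks at the first synset, and its lemma list may still contain the original word. A slightly broader variant (a sketch, not part of the original code) collects lemmas from every synset and filters the word itself out:

from nltk.corpus import wordnet as wn

def get_synonyms_all(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemma_names():
            # WordNet joins multi-word lemmas with underscores
            synonyms.add(lemma.replace('_', ' '))
    synonyms.discard(word) # never offer the word itself as its own synonym
    return list(synonyms)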

Usage example

words="hello world"
lenW=len(words)/4#随机替换1/4的词语
newWrods=synonym_replacement(words.split(" "),lenW)

3. Random insertion

########################################################################
# Random insertion
# Insert n words into the sentence at random positions
########################################################################
# words is a list of tokens,
# e.g. "hello world".split(" ") or ["hello", "world"]
def random_insertion(words, n):
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return " ".join(new_words)
# insert a synonym of a randomly chosen word at a random position
def add_word(new_words):
    synonyms = []
    counter = 0    
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

Usage example

words="hello world"
lenW=len(words)/4#随机替换1/4的词语
newWrods=random_insertion(words.split(" "),lenW)

4. Random swap

########################################################################
# Random swap
# Swap two random words, n times
########################################################################
# words is a list of tokens,
# e.g. "hello world".split(" ") or ["hello", "world"]
def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return " ".join(new_words)

def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
    return new_words
words="hello world"
lenW=len(words)/4#随机替换1/4的词语
newWrods=random_swap(words.split(" "),lenW)

5. Random deletion

########################################################################
# Random deletion
# Delete each word in the sentence with probability p
########################################################################
# words is a list of tokens,
# e.g. "hello world".split(" ") or ["hello", "world"]
def random_deletion(words, p):

    # a one-word sentence is returned unchanged
    if len(words) == 1:
        return " ".join(words)

    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)

    # if every word was deleted, keep one random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return words[rand_int]

    return " ".join(new_words)
words="hello world"
#随机替换1/4的词语
newWrods=random_deletion(words.split(" "),1/4)
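Synonym replacement, random insertion, random swap, and random deletion are exactly the four operations of Easy Data Augmentation (EDA; Wei & Zou, 2019). A minimal sketch that combines the functions defined above into one augmenter (the name eda and its parameters are my own, not from a library):

import random

def eda(sentence, p=0.25, num_aug=4):
    words = sentence.split(" ")
    n = max(1, int(len(words) * p)) # how many words each operation touches
    ops = [
        lambda: synonym_replacement(words, n),
        lambda: random_insertion(words, n),
        lambda: random_swap(words, n),
        lambda: random_deletion(words, p),
    ]
    # each augmented sentence comes from one randomly chosen operation
    return [random.choice(ops)() for _ in range(num_aug)]

print(eda("the quick brown fox jumps over the lazy dog"))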

6. Adversarial training for data augmentation

Adversarial training augments data by adding perturbations to the embedded word vectors, which amounts to generating new samples; unlike the methods above, this happens during training itself. Two perturbation methods were used in this competition, the Fast Gradient Method (FGM) and Projected Gradient Descent (PGD); their usage is given below.
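For reference, the standard definitions behind the code below: FGM takes a single step along the normalized gradient of the loss with respect to the embeddings, r_adv = epsilon * g / ||g||_2, whereas PGD takes K smaller steps of size alpha and, after each step, projects the accumulated perturbation back into the ball of radius epsilon around the original embeddings.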

1) FGM

# Definition
import torch
class FGM(object):
    def __init__(self, model):
        super(FGM, self).__init__()
        self.model = model
        self.backup = {}
 
    def attack(self, epsilon=1., emb_name='emb.'):
        # emb_name must match the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)
 
    def restore(self, emb_name='emb.'):
        # emb_name must match the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name: 
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
# Usage
fgm = FGM(model)
for batch_input, batch_label in data:
    # normal training step
    loss = model(batch_input, batch_label)
    loss.backward() # backward pass: the normal gradients
    # adversarial step
    fgm.attack() # add an adversarial perturbation to the embeddings
    loss_adv = model(batch_input, batch_label)
    loss_adv.backward() # backward pass: accumulates the adversarial gradients on top of the normal ones
    fgm.restore() # restore the original embedding parameters
    # gradient descent: update the parameters
    optimizer.step()
    model.zero_grad()
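One practical note: the default emb_name='emb.' will only work if your embedding parameter actually contains that substring; for a Hugging Face BERT model, for example, the embedding parameter name contains word_embeddings, so fgm.attack(emb_name='word_embeddings') and fgm.restore(emb_name='word_embeddings') would be the usual choice.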

2) PGD

# Definition
import torch
class PGD():
    def __init__(self, model):
        self.model = model
        self.emb_backup = {}
        self.grad_backup = {}
 
    def attack(self, epsilon=1., alpha=0.3, emb_name='emb.', is_first_attack=False):
        # emb_name must match the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = alpha * param.grad / norm
                    param.data.add_(r_at)
                    param.data = self.project(name, param.data, epsilon)
 
    def restore(self, emb_name='emb.'):
        # emb_name must match the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name: 
                assert name in self.emb_backup
                param.data = self.emb_backup[name]
        self.emb_backup = {}
 
    def project(self, param_name, param_data, epsilon):
        r = param_data - self.emb_backup[param_name]
        if torch.norm(r) > epsilon:
            r = epsilon * r / torch.norm(r)
        return param_data + r
 
    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # clone the gradient: storing param.grad directly would alias the live
                # tensor, which later zero_grad()/backward() calls modify in place
                self.grad_backup[name] = param.grad.clone()
 
    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.grad = self.grad_backup[name]
# Usage
pgd = PGD(model)
K = 3
for batch_input, batch_label in data:
    # normal training step
    loss = model(batch_input, batch_label)
    loss.backward() # backward pass: the normal gradients
    pgd.backup_grad()
    # adversarial steps
    for t in range(K):
        pgd.attack(is_first_attack=(t==0)) # perturb the embeddings; back up param.data on the first attack
        if t != K-1:
            model.zero_grad()
        else:
            pgd.restore_grad()
        loss_adv = model(batch_input, batch_label)
        loss_adv.backward() # backward pass: accumulates the adversarial gradients on top of the normal ones
    pgd.restore() # restore the original embedding parameters
    # gradient descent: update the parameters
    optimizer.step()
    model.zero_grad()
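A note on the design: the inner loop runs K ascent steps of size alpha while project() keeps the total perturbation inside the epsilon-ball, and only the gradient from the final step is accumulated onto the clean gradient (restore_grad() discards the intermediate ones). PGD therefore costs roughly K extra forward/backward passes per batch, compared to FGM's single extra pass.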