用paddlepaddle的LSTM进行文本分类的代码

# encoding=utf8
import re
import random
import tarfile
import requests
import numpy as np
import paddle
from paddle.nn import Embedding
import paddle.nn.functional as F
from paddle.nn import LSTM, Embedding, Dropout, Linear
import paddle.fluid as fluid

import jieba

# 加载数据
def load_imdb():
    data_set = []
    index = 0
    label_dic = {}
    with open("thuc_data/cnews.val.txt", "rb") as file:
        for line in file:
            line_1 = line.decode("utf8","ignore")
            index += 1
            ll = line_1.strip().split("\t")
            if(len(ll)!=2):
                continue
            sentence_label = ll[0]
            content = ll[1]
            # 使用jieba分词对文本进行分词
            sentence = jieba.cut(content, cut_all=False)
            data_set.append((" ".join(sentence), sentence_label)) 

      
        
    return data_set
            
            
            
                
            
train_corpus = load_imdb()

for i in range(100):
    print("sentence %d, %d" % (i, len(train_corpus[i][0].split(" "))))  
    print("sentence %d, label %s" % (i, train_corpus[i][1]))

# 构造词典,统计每个词的频率,并根据频率将每个词转换为一个整数id
def build_dict(corpus):
    
    word_freq_dict = dict()
    for sentence, _ in corpus:
        
        for word in sentence.split(" "):
            if word not in word_freq_dict:
                word_freq_dict[word] = 0
            word_freq_dict[word] += 1

    word_freq_dict = sorted(word_freq_dict.items(), key = lambda x:x[1], reverse = True)
    
    word2id_dict = dict()
    word2id_freq = dict()

    # 一般来说,我们把oov和pad放在词典前面,给他们一个比较小的id,这样比较方便记忆,并且易于后续扩展词表
    word2id_dict['[oov]'] = 0
    word2id_freq[0] = 1e10

    word2id_dict['[pad]'] = 1
    word2id_freq[1] = 1e10

    for word, freq in word_freq_dict:
        word2id_dict[word] = len(word2id_dict)
        word2id_freq[word2id_dict[word]] = freq

    return word2id_freq, word2id_dict

word2id_freq, word2id_dict = build_dict(train_corpus)
vocab_size = len(word2id_freq)
print("there are totoally %d different words in the corpus" % vocab_size)
for _, (word, word_id) in zip(range(10), word2id_dict.items()):
    print("word %s, its id %d, its word freq %d" % (word, word_id, word2id_freq[word_id]))



# 把语料转换为id序列
def convert_corpus_to_id(corpus, word2id_dict):
    data_set = []
    label_dict = {
        "体育":1,
    "娱乐":2,
    "家居":3,
    "房产":4,
    "教育":5,
    "时尚":6,
    "时政":7,
    "游戏":8,
    "科技":9,
    "财经":10
        
    }
    
    for sentence, sentence_label in corpus:
        label = label_dict.get(sentence_label, -1)
        # 将句子中的词逐个替换成id,如果句子中的词不在词表内,则替换成oov
        # 这里需要注意,一般来说我们可能需要查看一下test-set中,句子oov的比例,
        # 如果存在过多oov的情况,那就说明我们的训练数据不足或者切分存在巨大偏差,需要调整
        sentence = [word2id_dict[word] if word in word2id_dict \
                    else word2id_dict['[oov]'] for word in sentence.split(" ")]    
        data_set.append((sentence, label))
    return data_set

train_corpus = convert_corpus_to_id(train_corpus, word2id_dict)

print("%d tokens in the corpus" % len(train_corpus))
print(train_corpus[:5])


# 编写一个迭代器,每次调用这个迭代器都会返回一个新的batch,用于训练或者预测
def build_batch(word2id_dict, corpus, batch_size, epoch_num, max_seq_len, shuffle = True, drop_last = True):

    # 模型将会接受的两个输入:
    # 1. 一个形状为[batch_size, max_seq_len]的张量,sentence_batch,代表了一个mini-batch的句子。
    # 2. 一个形状为[batch_size, 1]的张量,sentence_label_batch,每个元素都是非0即1,代表了每个句子的情感类别(正向或者负向)
    sentence_batch = []
    sentence_label_batch = []

    for _ in range(epoch_num): 

        #每个epoch前都shuffle一下数据,有助于提高模型训练的效果
        #但是对于预测任务,不要做数据shuffle
        if shuffle:
            random.shuffle(corpus)

        for sentence, sentence_label in corpus:
            sentence_sample = sentence[:min(max_seq_len, len(sentence))]
            if len(sentence_sample) < max_seq_len:
                for _ in range(max_seq_len - len(sentence_sample)):
                    sentence_sample.append(word2id_dict['[pad]'])
            
            
            sentence_sample = [[word_id] for word_id in sentence_sample]

            sentence_batch.append(sentence_sample)
            sentence_label_batch.append([sentence_label])

            if len(sentence_batch) == batch_size:
                yield np.array(sentence_batch).astype("int64"), np.array(sentence_label_batch).astype("int64")
                sentence_batch = []
                sentence_label_batch = []
    if not drop_last and len(sentence_batch) > 0:
        yield np.array(sentence_batch).astype("int64"), np.array(sentence_label_batch).astype("int64")

for batch_id, batch in enumerate(build_batch(word2id_dict, train_corpus, batch_size=10, epoch_num=1, max_seq_len=30)):
    print(batch_id, batch)

# 定义一个用于情感分类的网络实例,SentimentClassifier
class SentimentClassifier(paddle.nn.Layer):
    
    def __init__(self, hidden_size, vocab_size, embedding_size, class_num=11, num_steps=128, num_layers=1, init_scale=0.1, dropout_rate=None):
        
        # 参数含义如下:
        # 1.hidden_size,表示embedding-size,hidden和cell向量的维度
        # 2.vocab_size,模型可以考虑的词表大小
        # 3.embedding_size,表示词向量的维度
        # 4.class_num,情感类型个数,可以是2分类,也可以是多分类
        # 5.num_steps,表示这个情感分析模型最大可以考虑的句子长度
        # 6.num_layers,表示网络的层数
        # 7.dropout_rate,表示使用dropout过程中失活的神经元比例
        # 8.init_scale,表示网络内部的参数的初始化范围,长短时记忆网络内部用了很多Tanh,Sigmoid等激活函数,\
        # 这些函数对数值精度非常敏感,因此我们一般只使用比较小的初始化范围,以保证效果
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.class_num = class_num
        self.num_steps = num_steps
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.init_scale = init_scale
       
        # 声明一个LSTM模型,用来把每个句子抽象成向量
        self.simple_lstm_rnn = paddle.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)

        # 声明一个embedding层,用来把句子中的每个词转换为向量
        self.embedding = paddle.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, sparse=False, 
                                    weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(low=-init_scale, high=init_scale)))
        
        # 声明使用上述语义向量映射到具体情感类别时所需要使用的线性层
        self.cls_fc = paddle.nn.Linear(in_features=self.hidden_size, out_features=self.class_num, 
                             weight_attr=None, bias_attr=None)
        
        # 一般在获取单词的embedding后,会使用dropout层,防止过拟合,提升模型泛化能力
        self.dropout_layer = paddle.nn.Dropout(p=self.dropout_rate, mode='upscale_in_train')

    # forwad函数即为模型前向计算的函数,它有两个输入,分别为:
    # input为输入的训练文本,其shape为[batch_size, max_seq_len]
    # label训练文本对应的情感标签,其shape维[batch_size, 1]
    def forward(self, inputs):
        # 获取输入数据的batch_size
        batch_size = inputs.shape[0]

        # 本实验默认使用1层的LSTM,首先我们需要定义LSTM的初始hidden和cell,这里我们使用0来初始化这个序列的记忆
        init_hidden_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')
        init_cell_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')

        # 将这些初始记忆转换为飞桨可计算的向量,并且设置stop_gradient=True,避免这些向量被更新,从而影响训练效果
        init_hidden = paddle.to_tensor(init_hidden_data)
        init_hidden.stop_gradient = True
        init_cell = paddle.to_tensor(init_cell_data)
        init_cell.stop_gradient = True

        # 对应以上第2步,将输入的句子的mini-batch转换为词向量表示,转换后输入数据shape为[batch_size, max_seq_len, embedding_size]
        x_emb = self.embedding(inputs)
        x_emb = paddle.reshape(x_emb, shape=[-1, self.num_steps, self.embedding_size])
        # 在获取的词向量后添加dropout层
        if self.dropout_rate is not None and self.dropout_rate > 0.0:
            x_emb = self.dropout_layer(x_emb)
        
        # 对应以上第3步,使用LSTM网络,把每个句子转换为语义向量
        # 返回的last_hidden即为最后一个时间步的输出,其shape为[self.num_layers, batch_size, hidden_size]
        rnn_out, (last_hidden, last_cell) = self.simple_lstm_rnn(x_emb, (init_hidden, init_cell))
        # 提取最后一层隐状态作为文本的语义向量,其shape为[batch_size, hidden_size]
        last_hidden = paddle.reshape(last_hidden[-1], shape=[-1, self.hidden_size])

        # 对应以上第4步,将每个句子的向量表示映射到具体的情感类别上, logits的维度为[batch_size, 2]
        logits = self.cls_fc(last_hidden)
        
        return logits
    
# 定义训练参数
epoch_num = 5
batch_size = 128 

learning_rate = 0.0005
dropout_rate = 0.1
num_layers = 1
hidden_size = 256
embedding_size = 256
max_seq_len =  128
vocab_size = len(word2id_freq)

# 检测是否可以使用GPU,如果可以优先使用GPU
use_gpu = True if paddle.get_device().startswith("gpu") else False
if use_gpu:
    paddle.set_device('gpu:0')

# 实例化模型
sentiment_classifier = SentimentClassifier(hidden_size, vocab_size, embedding_size,  num_steps=max_seq_len, num_layers=num_layers, dropout_rate=dropout_rate)

# 指定优化策略,更新模型参数
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, parameters= sentiment_classifier.parameters()) 

# 定义训练函数
# 记录训练过程中的损失变化情况,可用于后续画图查看训练情况
losses = []
steps = []

def train(model):
    # 开启模型训练模式
    model.train()
    
    # 建立训练数据生成器,每次迭代生成一个batch,每个batch包含训练文本和文本对应的情感标签
    # word2id_dict 词典, train_corpus 训练数据的向量表示, batch_size 每个批的大小, epoch_num 训练的次数, max_seq_len最大输入的长度
    train_loader = build_batch(word2id_dict, train_corpus, batch_size, epoch_num, max_seq_len)
    #step:迭代的次数
    for step, (sentences, labels) in enumerate(train_loader):
        # 获取数据,并将张量转换为Tensor类型
        sentences = paddle.to_tensor(sentences)
        labels = paddle.to_tensor(labels)
        one_hot_label = fluid.layers.one_hot(input=labels, depth=11)

        
        # 前向计算,将数据feed进模型,并得到预测的情感标签和损失
        logits = model(sentences)
        #print(labels, "--->", logits, "--->", one_hot_label)

        # 计算损失
        loss = F.cross_entropy(input=logits, label=one_hot_label, soft_label=True)
        loss = paddle.mean(loss)
		
	# 后向传播
        loss.backward()
        # 更新参数
        optimizer.step()
        # 清除梯度
        optimizer.clear_grad()
        # 记录当前步骤的loss变化情况
        losses.append(loss.numpy()[0])
        steps.append(step)
        # 打印当前loss数值
        print("step %d, loss %.3f" % (step, loss.numpy()[0]))

#训练模型
train(sentiment_classifier)