Code for text classification with an LSTM in PaddlePaddle
# encoding=utf8
import random
import numpy as np
import paddle
import paddle.nn.functional as F
import jieba
# Load the data: the THUCNews validation split, one "label\tcontent" line per sample
def load_thucnews():
    data_set = []
    with open("thuc_data/cnews.val.txt", "rb") as file:
        for line in file:
            line_1 = line.decode("utf8", "ignore")
            ll = line_1.strip().split("\t")
            # Skip malformed lines that do not have exactly one label and one content field
            if len(ll) != 2:
                continue
            sentence_label = ll[0]
            content = ll[1]
            # Tokenize the content with jieba
            sentence = jieba.cut(content, cut_all=False)
            data_set.append((" ".join(sentence), sentence_label))
    return data_set

train_corpus = load_thucnews()
for i in range(100):
    print("sentence %d, %d tokens" % (i, len(train_corpus[i][0].split(" "))))
    print("sentence %d, label %s" % (i, train_corpus[i][1]))
# Build the vocabulary: count each word's frequency and map each word to an integer id by frequency rank
def build_dict(corpus):
    word_freq_dict = dict()
    for sentence, _ in corpus:
        for word in sentence.split(" "):
            if word not in word_freq_dict:
                word_freq_dict[word] = 0
            word_freq_dict[word] += 1
    word_freq_dict = sorted(word_freq_dict.items(), key=lambda x: x[1], reverse=True)
    word2id_dict = dict()
    word2id_freq = dict()
    # By convention, [oov] and [pad] are placed at the front of the vocabulary with small,
    # easy-to-remember ids, which also makes it easier to extend the vocabulary later
    word2id_dict['[oov]'] = 0
    word2id_freq[0] = int(1e10)
    word2id_dict['[pad]'] = 1
    word2id_freq[1] = int(1e10)
    for word, freq in word_freq_dict:
        word2id_dict[word] = len(word2id_dict)
        word2id_freq[word2id_dict[word]] = freq
    return word2id_freq, word2id_dict

word2id_freq, word2id_dict = build_dict(train_corpus)
vocab_size = len(word2id_freq)
print("there are totally %d different words in the corpus" % vocab_size)
for _, (word, word_id) in zip(range(10), word2id_dict.items()):
    print("word %s, its id %d, its word freq %d" % (word, word_id, word2id_freq[word_id]))
# Convert the corpus to id sequences
def convert_corpus_to_id(corpus, word2id_dict):
    data_set = []
    # Labels are mapped to 1..10; id 0 is left unused, which is why class_num=11 below
    label_dict = {
        "体育": 1,
        "娱乐": 2,
        "家居": 3,
        "房产": 4,
        "教育": 5,
        "时尚": 6,
        "时政": 7,
        "游戏": 8,
        "科技": 9,
        "财经": 10
    }
    for sentence, sentence_label in corpus:
        # Unknown labels map to -1; such samples should be filtered out before training
        label = label_dict.get(sentence_label, -1)
        # Replace each word in the sentence with its id; words outside the vocabulary map to [oov].
        # Note: it is usually worth checking the OOV ratio on the test set; a high ratio means
        # the training data is insufficient or the tokenization is badly skewed and needs to be
        # adjusted (a minimal sketch of such a check follows this function).
        sentence = [word2id_dict[word] if word in word2id_dict
                    else word2id_dict['[oov]'] for word in sentence.split(" ")]
        data_set.append((sentence, label))
    return data_set
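# Illustrative OOV check (an assumption, not part of the original pipeline): computes the
# fraction of tokens in a tokenized (but not yet id-converted) corpus that fall outside the
# vocabulary. A hypothetical held-out `test_corpus` would be loaded the same way as train_corpus.
def oov_rate(corpus, word2id_dict):
    oov, total = 0, 0
    for sentence, _ in corpus:
        for word in sentence.split(" "):
            total += 1
            if word not in word2id_dict:
                oov += 1
    return oov / max(total, 1)
# Example usage on a hypothetical held-out split:
# print("test-set OOV rate: %.4f" % oov_rate(test_corpus, word2id_dict))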
train_corpus = convert_corpus_to_id(train_corpus, word2id_dict)
print("%d sentences in the corpus" % len(train_corpus))
print(train_corpus[:5])
# A generator that yields one batch per call, for training or prediction
def build_batch(word2id_dict, corpus, batch_size, epoch_num, max_seq_len, shuffle=True, drop_last=True):
    # Each yielded batch contains the model's two inputs:
    # 1. sentence_batch, an array of shape [batch_size, max_seq_len, 1], one mini-batch of id sequences.
    # 2. sentence_label_batch, an array of shape [batch_size, 1], each sentence's class label.
    sentence_batch = []
    sentence_label_batch = []
    for _ in range(epoch_num):
        # Shuffling the data before each epoch usually helps training,
        # but do not shuffle when predicting
        if shuffle:
            random.shuffle(corpus)
        for sentence, sentence_label in corpus:
            # Truncate to max_seq_len, then pad shorter sentences with [pad]
            sentence_sample = sentence[:max_seq_len]
            if len(sentence_sample) < max_seq_len:
                for _ in range(max_seq_len - len(sentence_sample)):
                    sentence_sample.append(word2id_dict['[pad]'])
            # Wrap each word id in a single-element list, so the batch has shape
            # [batch_size, max_seq_len, 1]; the model reshapes the embeddings accordingly
            sentence_sample = [[word_id] for word_id in sentence_sample]
            sentence_batch.append(sentence_sample)
            sentence_label_batch.append([sentence_label])
            if len(sentence_batch) == batch_size:
                yield np.array(sentence_batch).astype("int64"), np.array(sentence_label_batch).astype("int64")
                sentence_batch = []
                sentence_label_batch = []
    if not drop_last and len(sentence_batch) > 0:
        yield np.array(sentence_batch).astype("int64"), np.array(sentence_label_batch).astype("int64")
for batch_id, batch in enumerate(build_batch(word2id_dict, train_corpus, batch_size=10, epoch_num=1, max_seq_len=30)):
    print(batch_id, batch)
# Define the classification network, SentimentClassifier (the name is kept from the original
# sentiment-analysis tutorial; here it classifies news into 10 topics)
class SentimentClassifier(paddle.nn.Layer):
    def __init__(self, hidden_size, vocab_size, embedding_size, class_num=11, num_steps=128, num_layers=1, init_scale=0.1, dropout_rate=None):
        # Parameters:
        # 1. hidden_size: the dimension of the LSTM hidden and cell vectors
        # 2. vocab_size: the vocabulary size the model can handle
        # 3. embedding_size: the dimension of the word vectors
        # 4. class_num: the number of classes; this can be binary or multi-class
        # 5. num_steps: the maximum sentence length the model can consider
        # 6. num_layers: the number of LSTM layers
        # 7. dropout_rate: the fraction of units dropped by the dropout layer
        # 8. init_scale: the initialization range for the network parameters; LSTMs contain many
        #    Tanh and Sigmoid activations that are sensitive to numeric scale, so a small
        #    initialization range is generally used to keep training stable
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.class_num = class_num
        self.num_steps = num_steps
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.init_scale = init_scale
        # The LSTM that encodes each sentence into a vector; its input is the embedding
        # output, so input_size must equal embedding_size
        self.simple_lstm_rnn = paddle.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers)
        # The embedding layer that maps each word id in a sentence to a vector
        self.embedding = paddle.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, sparse=False,
            weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(low=-init_scale, high=init_scale)))
        # The linear layer that maps the sentence vector to class logits
        self.cls_fc = paddle.nn.Linear(in_features=self.hidden_size, out_features=self.class_num,
            weight_attr=None, bias_attr=None)
        # Dropout is usually applied to the word embeddings to reduce overfitting and improve
        # generalization; only create the layer when a valid rate is given
        if dropout_rate is not None and dropout_rate > 0.0:
            self.dropout_layer = paddle.nn.Dropout(p=dropout_rate, mode='upscale_in_train')
    # forward computes the model's forward pass. It takes one input:
    # inputs, the mini-batch of token ids, with shape [batch_size, max_seq_len, 1]
    def forward(self, inputs):
        # Get the batch size of the input
        batch_size = inputs.shape[0]
        # Define the LSTM's initial hidden and cell states, initialized to zeros
        init_hidden_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')
        init_cell_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')
        # Wrap the initial states as tensors and set stop_gradient=True so that they are
        # treated as constants and never updated during training
        init_hidden = paddle.to_tensor(init_hidden_data)
        init_hidden.stop_gradient = True
        init_cell = paddle.to_tensor(init_cell_data)
        init_cell.stop_gradient = True
        # Map the mini-batch of token ids to word embeddings; after the reshape the
        # shape is [batch_size, max_seq_len, embedding_size]
        x_emb = self.embedding(inputs)
        x_emb = paddle.reshape(x_emb, shape=[-1, self.num_steps, self.embedding_size])
        # Apply dropout to the word embeddings
        if self.dropout_rate is not None and self.dropout_rate > 0.0:
            x_emb = self.dropout_layer(x_emb)
        # Run the LSTM to encode each sentence into a semantic vector; last_hidden is the
        # hidden state of the final time step, with shape [num_layers, batch_size, hidden_size]
        rnn_out, (last_hidden, last_cell) = self.simple_lstm_rnn(x_emb, (init_hidden, init_cell))
        # Take the last layer's hidden state as the sentence vector, shape [batch_size, hidden_size]
        last_hidden = paddle.reshape(last_hidden[-1], shape=[-1, self.hidden_size])
        # Map each sentence vector to class logits, shape [batch_size, class_num]
        logits = self.cls_fc(last_hidden)
        return logits
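# Quick sanity check (illustrative, not part of the original code): run a random mini-batch
# through a small untrained model and confirm the logits come out as [batch_size, class_num].
# The sizes below are arbitrary and chosen only for this check.
_demo_model = SentimentClassifier(hidden_size=32, vocab_size=vocab_size, embedding_size=32, num_steps=30)
_demo_inputs = paddle.randint(low=0, high=vocab_size, shape=[4, 30, 1], dtype="int64")
print(_demo_model(_demo_inputs).shape)  # expected: [4, 11]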
# Training hyperparameters
epoch_num = 5
batch_size = 128
learning_rate = 0.0005
dropout_rate = 0.1
num_layers = 1
hidden_size = 256
embedding_size = 256
max_seq_len = 128
vocab_size = len(word2id_freq)
# Prefer the GPU when PaddlePaddle was built with CUDA support
use_gpu = paddle.is_compiled_with_cuda()
if use_gpu:
    paddle.set_device('gpu:0')
# Instantiate the model
sentiment_classifier = SentimentClassifier(hidden_size, vocab_size, embedding_size, num_steps=max_seq_len, num_layers=num_layers, dropout_rate=dropout_rate)
# Set up the optimizer that updates the model parameters
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, parameters=sentiment_classifier.parameters())
# Training loop
# Record the loss at each step so the training curve can be plotted afterwards
losses = []
steps = []
def train(model):
    # Switch the model to training mode
    model.train()
    # Build the training data generator; each batch contains the input texts and their labels.
    # word2id_dict: the vocabulary; train_corpus: the id-converted training data;
    # batch_size: samples per batch; epoch_num: number of epochs; max_seq_len: maximum input length
    train_loader = build_batch(word2id_dict, train_corpus, batch_size, epoch_num, max_seq_len)
    # step counts the iterations
    for step, (sentences, labels) in enumerate(train_loader):
        # Convert the numpy arrays to tensors
        sentences = paddle.to_tensor(sentences)
        labels = paddle.to_tensor(labels)
        # Forward pass: feed the batch through the model to get the class logits
        logits = model(sentences)
        # Compute the cross-entropy loss against the integer class labels
        # (F.cross_entropy averages over the batch by default)
        loss = F.cross_entropy(input=logits, label=labels)
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Clear the gradients
        optimizer.clear_grad()
        # Record the loss at this step
        losses.append(float(loss))
        steps.append(step)
        # Print the current loss
        print("step %d, loss %.3f" % (step, float(loss)))
# Train the model
train(sentiment_classifier)
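# Plot the recorded training curve, as the comment above `losses` suggests
# (a minimal sketch; assumes matplotlib is installed)
import matplotlib.pyplot as plt
plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("loss")
plt.show()

# A minimal evaluation sketch (an assumption, not part of the original code): computes accuracy
# on a hypothetical `eval_corpus` prepared with convert_corpus_to_id the same way as train_corpus.
@paddle.no_grad()
def evaluate(model, eval_corpus):
    model.eval()
    correct, total = 0, 0
    eval_loader = build_batch(word2id_dict, eval_corpus, batch_size, epoch_num=1,
                              max_seq_len=max_seq_len, shuffle=False, drop_last=False)
    for sentences, labels in eval_loader:
        logits = model(paddle.to_tensor(sentences))
        preds = paddle.argmax(logits, axis=-1).numpy()
        correct += int((preds == labels.squeeze(-1)).sum())
        total += len(labels)
    print("accuracy: %.3f" % (correct / total))
# evaluate(sentiment_classifier, eval_corpus)  # hypothetical held-out split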