关于自然语言处理系列-文本摘要提取进阶

原创

baoqiangwang 2022-04-12 16:22:23 ©著作权

©著作权归作者所有：来自51CTO博客作者baoqiangwang的原创作品，请联系作者获取转载授权，否则将追究法律责任

文本摘要是自然语言处理的一个重要组成部分。文本摘要的提取涉及分词、断句和文本权重三个问题：分词前文已述，断句通过正则表达式完成；文本权重包括句子的 TF-IDF 权重、文本相似度权重和句子的位置权重，同时还涉及权重的归一化处理以及各项权重的权值设置等。总的来说，这样提取出的摘要质量比直接使用 snownlp、sumy、goose 得到的效果要好一些。

相关代码来自互联网，不过自己做了一些优化和完善。

代码示例

# coding:utf-8
import jieba
import numpy as np
import collections
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import math
import re
def split_sentence(text, punctuation_list=r'([\s\.\!\?\。\！\？]+)'):
    """Split text into sentences, keeping each sentence's trailing delimiter.

    Args:
        text: The document to split.
        punctuation_list: Regex whose single capturing group matches a run of
            sentence delimiters. The capturing group is required: it makes
            ``re.split`` return the delimiters so they can be re-attached.

    Returns:
        A ``(sentence_set, sentence_with_index)`` tuple: the list of
        sentences in document order, and a dict mapping each sentence's
        position in that list to the sentence itself.
    """
    # With one capturing group, re.split alternates text and delimiter:
    # [text, delim, text, delim, ..., text].
    pieces = re.split(punctuation_list, text)
    # Pad so the final text piece also has an (empty) delimiter partner.
    pieces.append("")
    # Re-attach each delimiter to the text before it. Fragments with no text
    # (produced when the document starts or ends with a delimiter) are
    # dropped so they do not become empty "sentences" downstream.
    sentence_set = [
        body + delimiter
        for body, delimiter in zip(pieces[0::2], pieces[1::2])
        if body
    ]
    sentence_with_index = dict(enumerate(sentence_set))
    return sentence_set, sentence_with_index
def get_tfidf_matrix(sentence_set, stop_word):
    """Build a dense TF-IDF matrix with one row per sentence.

    Each sentence is segmented with jieba, stop words are removed, and the
    remaining tokens are joined with spaces so scikit-learn can tokenize them.

    Args:
        sentence_set: Iterable of sentence strings.
        stop_word: Iterable of words to discard after segmentation.

    Returns:
        A NumPy array with one row per sentence holding the TF-IDF values
        produced by ``CountVectorizer`` followed by ``TfidfTransformer``.
    """
    # Set membership is O(1); the stop-word list is probed once per token.
    stop_words = frozenset(stop_word)
    corpus = [
        ' '.join(word for word in jieba.cut(sent) if word not in stop_words)
        for sent in sentence_set
    ]
    # NOTE(review): CountVectorizer is used with its default settings; its
    # default token pattern appears to skip single-character tokens, which
    # affects single-character Chinese words - confirm this is intended.
    counts = CountVectorizer().fit_transform(corpus)
    tfidf = TfidfTransformer().fit_transform(counts)
    # The previous unused ``vectorizer.get_feature_names()`` call was removed:
    # its result was never read, and the method no longer exists in
    # scikit-learn >= 1.2.
    return np.array(tfidf.toarray())
def get_sentence_with_words_weight(tfidf_matrix):
    """Score every sentence by the min-max scaled sum of its TF-IDF row.

    Args:
        tfidf_matrix: 2-D array with one row per sentence.

    Returns:
        Dict mapping the row index to that row's scaled TF-IDF total.
    """
    # MinMaxScaler scales per column, so the row totals are presented as a
    # single column of shape (n_sentences, 1).
    row_totals = np.reshape(tfidf_matrix.sum(1), (-1, 1))
    scaled = preprocessing.MinMaxScaler().fit_transform(row_totals)
    # Back to a flat Python list, then key each value by its row index.
    return dict(enumerate(scaled.flatten().tolist()))
def get_sentence_with_position_weight(sentence_set):
    """Give each sentence a weight that decreases with its position.

    The weight of sentence ``i`` is ``log10(total - i)``. The original author
    chose a logarithm over the linear ``(total - i) / total`` because the
    linear form caused later sentences to be neglected.

    Args:
        sentence_set: Sequence of sentences; only its length is used.

    Returns:
        Dict mapping each sentence index to its position weight.
    """
    sentence_count = len(sentence_set)
    return {
        position: math.log(sentence_count - position, 10)
        for position in range(sentence_count)
    }
def similarity(sent1, sent2):
    """Return the cosine similarity of two sentence vectors.

    Args:
        sent1: NumPy vector (must support element-wise ``*`` with ``sent2``).
        sent2: NumPy vector of the same shape as ``sent1``.

    Returns:
        ``dot(sent1, sent2) / (1e-6 + |sent1| * |sent2|)``. The small epsilon
        keeps the result finite (0.0) when either vector is all zeros.
    """
    dot_product = np.sum(sent1 * sent2)
    norm_product = (np.sqrt(np.sum(sent1 * sent1))
                    * np.sqrt(np.sum(sent2 * sent2)))
    # The epsilon must sit inside the denominator. Without the parentheses,
    # operator precedence evaluated ``dot / 1e-6 + norms``, which is not a
    # cosine similarity at all.
    return dot_product / (1e-6 + norm_product)
def get_similarity_weight(tfidf_matrix):
    """Score each sentence by its total similarity to all sentences.

    For every row the ``similarity`` to every row (including itself) is
    summed; the sums are then min-max normalised.

    Args:
        tfidf_matrix: Sequence of sentence vectors, one per sentence.

    Returns:
        A ``defaultdict`` mapping sentence index to its normalised score.
        It is empty when ``tfidf_matrix`` has no rows, and every score is
        0.0 when all raw sums are equal (e.g. a single sentence).
    """
    sentence_score = collections.defaultdict(lambda: 0.)
    # Accumulate pairwise similarities row by row.
    for i, row_i in enumerate(tfidf_matrix):
        sentence_score[i] = sum(
            (similarity(row_i, row_j) for row_j in tfidf_matrix), 0.)
    # Nothing to normalise; max()/min() would raise on an empty collection.
    if not sentence_score:
        return sentence_score
    max_score = max(sentence_score.values())
    min_score = min(sentence_score.values())
    span = max_score - min_score
    for key in list(sentence_score):
        # A zero span means all sentences scored the same; map them to 0.0
        # instead of dividing by zero.
        sentence_score[key] = (
            (sentence_score[key] - min_score) / span if span else 0.)
    return sentence_score
def ranking_base_on_weigth(sentence_with_words_weight,
                           sentence_with_position_weight,
                           sentence_score, feature_weight=(1, 1, 1)):
    """Combine the three per-sentence scores and rank the sentences.

    Args:
        sentence_with_words_weight: Mapping of sentence index to TF-IDF score.
        sentence_with_position_weight: Mapping of sentence index to position
            score.
        sentence_score: Mapping of sentence index to similarity score; its
            keys decide which sentences are ranked.
        feature_weight: Sequence whose items 0, 1 and 2 are the coefficients
            for the TF-IDF, position and similarity scores respectively. The
            default is an immutable tuple so it cannot be shared and mutated
            between calls.

    Returns:
        List of ``(sentence_index, total_weight)`` tuples sorted by total
        weight, highest first.

    Raises:
        KeyError: If a key of ``sentence_score`` is missing from one of the
            other (plain dict) mappings.
    """
    sentence_weight = {
        sent: (feature_weight[0] * sentence_with_words_weight[sent]
               + feature_weight[1] * sentence_with_position_weight[sent]
               + feature_weight[2] * sentence_score[sent])
        for sent in sentence_score
    }
    return sorted(sentence_weight.items(),
                  key=lambda item: item[1], reverse=True)
def get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.3):
    """Assemble the summary from the highest-ranked sentences.

    Args:
        sentence_with_index: Mapping of sentence index to sentence text.
        sort_sent_weight: Sequence of ``(sentence_index, weight)`` entries,
            already ordered from most to least important.
        topK_ratio: Fraction of the ranked sentences to keep.

    Returns:
        The selected sentences concatenated in their original document
        order. Empty when there are no ranked sentences or the ratio is 0.
    """
    topK = int(len(sort_sent_weight) * topK_ratio)
    # int() truncates, so a short document would otherwise yield an empty
    # summary; keep at least one sentence when a positive ratio was asked for.
    if topK < 1 and topK_ratio > 0 and len(sort_sent_weight) > 0:
        topK = 1
    # Sort the chosen indices so the summary follows the document order
    # rather than the ranking order.
    chosen = sorted(entry[0] for entry in sort_sent_weight[:topK])
    return ''.join(sentence_with_index[index] for index in chosen)
if __name__ == '__main__':
    # Raw strings keep the Windows backslashes literal; sequences such as
    # "\P" or "\l" in a normal string are invalid escapes and trigger
    # warnings on recent Python versions. The resulting paths are unchanged.
    stopwordfile = r'C:\Python\Pycharm\langprocess\stopwords.txt'
    # Alternative sample documents:
    # test_text = r'C:\Python\Pycharm\langprocess\train\C4-Literature\C4-Literature02.txt'
    # test_text = r'C:\Python\Pycharm\langprocess\第一章.txt'
    test_text = r'C:\Python\Pycharm\langprocess\背影.txt'
    # Read the article to summarise; undecodable bytes are skipped.
    with open(test_text, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    # Load the stop-word dictionary, one word per line.
    with open(stopwordfile, 'r', encoding='utf-8') as f:
        stop_word = [line.strip() for line in f]
    # Split into sentences and build the index -> sentence lookup.
    sentence_set, sentence_with_index = split_sentence(
        text, punctuation_list=r'([\s\.\!\?\。\！\？]+)')
    # TF-IDF matrix with one row per sentence.
    tfidf_matrix = get_tfidf_matrix(sentence_set, stop_word)
    # Per-sentence TF-IDF score derived from the matrix.
    sentence_with_words_weight = get_sentence_with_words_weight(tfidf_matrix)
    # Per-sentence position score.
    sentence_with_position_weight = get_sentence_with_position_weight(
        sentence_set)
    # Per-sentence similarity score derived from the matrix.
    sentence_score = get_similarity_weight(tfidf_matrix)
    # Weighted combination of the three scores, ranked highest first.
    sort_sent_weight = ranking_base_on_weigth(
        sentence_with_words_weight,
        sentence_with_position_weight,
        sentence_score, feature_weight=[1, 0.01, 1])
    # Build the summary from the top share of ranked sentences.
    summarization = get_summarization(
        sentence_with_index, sort_sent_weight, topK_ratio=0.2)
    print('摘要:\n', summarization)