Each text file contains the data for one class (the labels 0: joy, 1: anger, 2: disgust, 3: depression correspond to the different emotion categories). The pipeline is:
- Read the text files.
- Use the per-class mean and variance to compute the probability of a feature value under a Gaussian distribution (Gaussian Naive Bayes; a minimal sketch follows this list).
- Extract text features: count word frequencies and keep the most common words.
- Match each document against the high-frequency word list to build its feature vector.
- Evaluate the prediction accuracy of the model.
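A minimal sketch of the classification step (the toy feature matrix below is a made-up stand-in for the TF-IDF vectors that extract_feat_from_data builds later; it is not part of the project code):

import numpy as np
from sklearn.naive_bayes import GaussianNB

# Toy data: 6 documents x 3 TF-IDF-like features, two classes (made-up numbers).
X = np.array([[0.9, 0.1, 0.0],
              [0.8, 0.2, 0.1],
              [0.7, 0.0, 0.2],
              [0.1, 0.8, 0.9],
              [0.0, 0.9, 0.7],
              [0.2, 0.7, 0.8]])
y = np.array([0, 0, 0, 1, 1, 1])

# GaussianNB fits a mean and a variance per class for every feature and scores
# new samples with the resulting Gaussian likelihoods times the class priors.
gnb = GaussianNB()
gnb.fit(X, y)
print(gnb.predict([[0.85, 0.1, 0.05]]))      # -> [0]
print(gnb.predict_proba([[0.1, 0.8, 0.8]]))  # per-class probabilities

main.py below builds train_X/train_y the same way, just with 200 TF-IDF features instead of 3.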
main.py
# -*- coding: utf-8 -*-
import os
import pandas as pd
import nltk
from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# csv file for the raw data
output_text_filename = 'raw_weibo_text.csv'

# csv file for the cleaned text data
output_cln_text_filename = 'clean_weibo_text.csv'

# Processing and cleaning the text data takes a while, so it is controlled by is_first_run.
# On the first run the raw text has to be processed and cleaned, so set it to True.
# If the text has already been processed and the cleaned data saved, set it to False.
is_first_run = True


def read_and_save_to_csv():
    """
    Read the raw text files and save the labels and text to a csv file.
    """
    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)

        # The label is the leading digit of the filename, i.e. 0, 1, 2, 3
        label = int(text_filename[0])

        # Read the text file
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        labels = [label] * len(lines)

        text_series = pd.Series(lines)
        label_series = pd.Series(labels)

        # Build the dataframe
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)

    result_df = pd.concat(text_w_label_df_lst, axis=0)

    # Save as a csv file
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=False, encoding='utf-8')


def run_main():
    """
    Main function.
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Processing and cleaning text data...', end=' ')
        # On the first run the raw text data has to be processed and cleaned

        # Read the raw text files and save the labels and text to a csv file
        read_and_save_to_csv()

        # Read the csv file back and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Process the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Inspect the basic statistics of the two sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 200

    # Collect the words of the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('Done')

    # 4. Train the Naive Bayes model
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))


if __name__ == '__main__':
    run_main()
tools.py
# -*- coding: utf-8 -*-
import re
import jieba.posseg as pseg
import pandas as pd
import math
import numpy as np

# Load the common stop words
with open('./中文停用词库.txt', 'r', encoding='utf-8') as f:
    stopwords1 = [line.rstrip() for line in f]
# stopwords2 = [line.rstrip() for line in open('./哈工大停用词表.txt', 'r', encoding='utf-8')]
# stopwords3 = [line.rstrip() for line in open('./四川大学机器智能实验室停用词库.txt', 'r', encoding='utf-8')]
# stopwords = stopwords1 + stopwords2 + stopwords3
stopwords = stopwords1


def proc_text(raw_line):
    """
    Process one line of text and return the segmentation result.
    """
    # 1. Remove all non-Chinese characters with a regular expression
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # 2. Segment with jieba and attach part-of-speech tags
    words_lst = pseg.cut(chinese_only)

    # 3. Remove stop words
    meaningful_words = []
    for word, flag in words_lst:
        # if (word not in stopwords) and (flag == 'v'):
        # words could also be filtered by part of speech, e.g. keeping only verbs
        if word not in stopwords:
            meaningful_words.append(word)

    return ' '.join(meaningful_words)


def split_train_test(text_df, size=0.8):
    """
    Split the data into a training set and a test set.
    """
    # To keep the class proportions identical in the training and test sets,
    # each class is split separately.
    train_dfs = []
    test_dfs = []

    labels = [0, 1, 2, 3]
    for label in labels:
        # Select the records with this label
        text_df_w_label = text_df[text_df['label'] == label]
        # Reset the index so every class is indexed from 0, which simplifies the split below
        text_df_w_label = text_df_w_label.reset_index(drop=True)

        # By default 80% of the data goes to the training set and 20% to the test set.
        # To keep things simple the first 80% of the rows become training data
        # and the last 20% become test data.
        # A random 80/20 split would work as well (a sketch of such a split,
        # split_train_test_random, follows this function).

        # Number of rows in this class
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        text_df_w_label_train = text_df_w_label.iloc[:split_line_no, :]
        text_df_w_label_test = text_df_w_label.iloc[split_line_no:, :]

        # Collect the per-class pieces of the training and test sets
        train_dfs.append(text_df_w_label_train)
        test_dfs.append(text_df_w_label_test)

    train_text_df = pd.concat(train_dfs).reset_index(drop=True)
    test_text_df = pd.concat(test_dfs).reset_index(drop=True)
    return train_text_df, test_text_df
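

# A minimal sketch of the random, stratified 80/20 split mentioned in the comment
# above. split_train_test_random is a hypothetical helper, not called by main.py;
# it assumes the same 'label' and 'text' columns as split_train_test.
def split_train_test_random(text_df, size=0.8, seed=42):
    train_parts, test_parts = [], []
    for label in [0, 1, 2, 3]:
        group = text_df[text_df['label'] == label]
        # Randomly sample 80% of this class for training ...
        train_part = group.sample(frac=size, random_state=seed)
        # ... and keep the remaining rows for testing.
        test_part = group.drop(train_part.index)
        train_parts.append(train_part)
        test_parts.append(test_part)
    train_text_df = pd.concat(train_parts).reset_index(drop=True)
    test_text_df = pd.concat(test_parts).reset_index(drop=True)
    return train_text_df, test_text_df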


def get_word_list_from_data(text_df):
    """
    Collect all words of the dataset into a single list.
    """
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list


def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """
    Feature extraction.
    """
    # Only TF-IDF features are used here as an example.
    # Word counts or other text features could be added as extra features.

    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]

    # Initialisation
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)

    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))

        text = r_data['text']

        feat_vec = []
        for word in common_words:
            if word in text:
                # The high-frequency word occurs in this sample, so compute its TF-IDF value
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0

            feat_vec.append(tf_idf_val)

        # Assign the values
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])

    return X, y
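

# A sketch of the "extra features" idea from the comment above:
# extract_feat_with_counts is a hypothetical variant, not called by main.py, that
# appends the raw count of every high-frequency word to the TF-IDF vector,
# doubling the feature dimension.
def extract_feat_with_counts(text_df, text_collection, common_words_freqs):
    common_words = [word for word, _ in common_words_freqs]
    X = np.zeros([text_df.shape[0], 2 * len(common_words)])
    y = np.zeros(text_df.shape[0])
    for i, r_data in text_df.iterrows():
        text = r_data['text']
        words = text.split(' ')
        # TF-IDF part, computed exactly as in extract_feat_from_data above
        tf_idf_vec = [text_collection.tf_idf(w, text) if w in text else 0
                      for w in common_words]
        # Raw-count part: how often each high-frequency word occurs in this sample
        count_vec = [words.count(w) for w in common_words]
        X[i, :] = np.array(tf_idf_vec + count_vec)
        y[i] = int(r_data['label'])
    return X, y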


def cal_acc(true_labels, pred_labels):
    """
    Compute the accuracy.
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]

    acc = sum(correct_list) / n_total
    return acc