1. KMeans Text Clustering Algorithm
1.1 Overview of Text Clustering
Text clustering is an important application area in NLP, and there are many clustering algorithms to choose from, such as KMeans, DBSCAN, BIRCH and CURE. Here we focus on the most classic of them, KMeans. KMeans is an unsupervised learning algorithm for the clustering problem: it automatically groups data into clusters without any labels. Text clustering has plenty of use cases, for example data mining, information retrieval, topic detection and text summarization.
Text clustering partitions a document collection so that documents of the same category end up together while documents from different categories have low similarity to each other. Because no documents need to be labelled in advance, the process is highly automated.
The algorithm takes a parameter k and partitions the n input data objects into k clusters, such that objects within the same cluster are highly similar to one another while objects in different clusters are not.
1.2 How the KMeans Algorithm Works
Core idea: take k points in the feature space as cluster centers, assign each object to its nearest center, and then iteratively update the value of every cluster center until the best clustering is obtained.
Algorithm steps (a minimal code sketch follows the list):
- Choose suitable initial centers for the k clusters.
- In the n-th iteration, compute the distance from every sample to the k centers and assign the sample to the cluster whose center is closest.
- Update each cluster's center, for example as the mean of its members.
- If none of the k cluster centers change after repeating steps 2 and 3, stop; otherwise continue iterating.
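The four steps are short enough to sketch directly. Below is a minimal, purely illustrative NumPy version with random initialization and Euclidean distances (it does not handle empty clusters); the actual clustering in section 3.4 uses scikit-learn's KMeans instead.
import numpy as np

def kmeans(X, k, max_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # step 1: pick k samples as the initial centers
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(max_iter):
        # step 2: assign every sample to its nearest center
        distances = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        # step 3: move each center to the mean of its members
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        # step 4: stop once the centers no longer move
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers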
1.3 Choosing the Initial Cluster Centers
The initial centers have a large influence on the final partition; good initial points speed up convergence and improve the separation between clusters.
Common ways of choosing the initial centers include:
- Random selection: randomly pick k objects as the initial centers.
- Max-min selection: first take the two objects that are farthest apart as centers; then pick a third point whose minimum distance to the already chosen centers is the largest among all remaining points, and keep adding points by the same rule (see the sketch after this list).
- Minimum-distance selection: choose a positive number r and take the centroid of all objects as the first center; then feed in the objects one by one, and whenever the current object is farther than r from every confirmed center, make it a new center.
- Nearest-center assignment: the assignment step decides which cluster the current object belongs to, and the most common rule is to assign each object to its nearest center.
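The max-min rule above can be sketched as follows. This is purely illustrative: it assumes Euclidean distance and builds the full pairwise distance matrix, so it is only meant for small data sets.
import numpy as np

def farthest_first_centers(X, k):
    # start from the two points that are farthest apart
    dists = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=2)
    i, j = np.unravel_index(dists.argmax(), dists.shape)
    centers = [int(i), int(j)]
    while len(centers) < k:
        # distance from every point to its nearest chosen center
        min_dist = dists[:, centers].min(axis=1)
        # add the point whose nearest-center distance is largest
        centers.append(int(min_dist.argmax()))
    return X[centers]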
2. Scraping Douban Books Data
2.1 The Pages to Crawl
The crawler starts from Douban's tag index page (https://book.douban.com/tag/?view=type&icn=index-sorttags-all), collects every book tag, and then walks the listing page of each tag at https://book.douban.com/tag/<tag>.
2.2 Crawler Code
import bs4
import re
import requests
import csv
import codecs
import time


class DouBanSpider:
    def __init__(self):
        self.userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        self.headers = {"User-Agent": self.userAgent}

    # fetch the page that lists Douban's book category tags
    def getBookCategories(self):
        try:
            url = "https://book.douban.com/tag/?view=type&icn=index-sorttags-all"
            response = requests.get(url, headers=self.headers)
            content = response.text
            return content
        except Exception as e:
            print("failed to fetch the tag page:", e)
            return None
    # extract the name of every category tag from the page
    def getCategroiesContent(self):
        content = self.getBookCategories()
        if not content:
            print("failed to fetch the page...")
            return None
        soup = bs4.BeautifulSoup(content, 'lxml')
        categroyMatch = re.compile(r"^/tag/*")
        categroies = []
        for category in soup.find_all("a", {"href": categroyMatch}):
            if category:
                categroies.append(category.string)
        return categroies

    # build the listing URL of every tag
    def getCategoryLink(self):
        categories = self.getCategroiesContent()
        categoryLinks = []
        for item in categories:
            link = "https://book.douban.com/tag/" + str(item)
            categoryLinks.append(link)
        return categoryLinks
    # crawl the book entries of every category page
    def getBookInfo(self, categroyLinks):
        self.setCsvTitle()
        categroies = categroyLinks
        try:
            for link in categroies:
                print("crawling: " + link)
                bookList = []
                response = requests.get(link, headers=self.headers)
                soup = bs4.BeautifulSoup(response.text, 'lxml')
                bookCategroy = soup.h1.string
                for book in soup.find_all("li", {"class": "subject-item"}):
                    bookSoup = bs4.BeautifulSoup(str(book), 'lxml')
                    bookTitle = bookSoup.h2.a["title"]
                    bookAuthor = bookSoup.find("div", {"class": "pub"})
                    bookComment = bookSoup.find("span", {"class": "pl"})
                    bookContent = bookSoup.li.p
                    if bookTitle and bookAuthor and bookComment and bookContent:
                        bookList.append([bookTitle.strip(), bookCategroy.strip(),
                                         bookAuthor.string.strip(),
                                         bookComment.string.strip(),
                                         bookContent.string.strip()])
                self.saveBookInfo(bookList)
                time.sleep(3)
            print("crawling finished...")
        except Exception as e:
            print("crawling failed:", e)
            return None
    # write the CSV header row
    def setCsvTitle(self):
        csvFile = codecs.open("data/data.csv", "a", "utf_8_sig")
        try:
            writer = csv.writer(csvFile)
            writer.writerow(["title", "tag", "info", "comments", "content"])
        finally:
            csvFile.close()

    # append the collected book records to the CSV file
    def saveBookInfo(self, bookList):
        csvFile = codecs.open("data/data.csv", "a", "utf_8_sig")
        try:
            writer = csv.writer(csvFile)
            for book in bookList:
                writer.writerow(book)
        finally:
            csvFile.close()

    # entry point
    def start(self):
        categoryLink = self.getCategoryLink()
        self.getBookInfo(categoryLink)


douBanSpider = DouBanSpider()
douBanSpider.start()
2.3 Scraping Results
The crawl writes data/data.csv with the columns title, tag, info, comments and content; the run used in section 3 contains 2823 book records.
3. Text Clustering of the Douban Books
3.1 Text Normalization Class
import re
import string
import jieba


class Normalization:
    def __init__(self):
        # load the stop word list, stripping trailing newlines
        with open("dict/stop_words.utf8", encoding="utf8") as f:
            self.stopword_list = [line.strip() for line in f]

    def tokenize_text(self, text):
        tokens = jieba.lcut(text)  # word segmentation
        tokens = [token.strip() for token in tokens]  # strip surrounding whitespace
        return tokens

    def remove_special_characters(self, text):
        tokens = self.tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
        filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_stopwords(self, text):
        tokens = self.tokenize_text(text)
        filtered_tokens = [token for token in tokens if token not in self.stopword_list]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def normalize_corpus(self, corpus):
        # here we only segment each document and re-join it with spaces;
        # remove_special_characters / remove_stopwords can be chained in as needed
        normalized_corpus = []
        for text in corpus:
            text = ' '.join(jieba.lcut(text))
            normalized_corpus.append(text)
        return normalized_corpus


normalization = Normalization()
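As a quick check that the normalizer behaves as expected, it can be run on a made-up sentence (this assumes the dict/stop_words.utf8 file loaded in the constructor is present):
sample = ["KMeans 是一种经典的无监督聚类算法。"]
print(normalization.normalize_corpus(sample))     # one segmented, space-separated string per document
print(normalization.remove_stopwords(sample[0]))  # the same text with stop words removed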
3.2 Loading the Data
import pandas as pd

book_data = pd.read_csv('data/data.csv')
book_data.head()
3.3 Processing the Data
# extract the titles and the content column
book_titles = book_data['title'].tolist()
book_content = book_data['content'].tolist()
# normalize the corpus
norm_book_content = normalization.normalize_corpus(book_content)
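The build_feature_matrix helper used in the next step is not defined anywhere in this article. A minimal sketch of what it might look like, assuming it simply wraps scikit-learn's TfidfVectorizer/CountVectorizer and mirrors the parameters of the call below:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, feature_type='tfidf',
                         min_df=0.0, max_df=1.0, ngram_range=(1, 1)):
    # choose the vectorizer that matches feature_type
    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                     ngram_range=ngram_range)
    else:
        vectorizer = CountVectorizer(binary=(feature_type == 'binary'),
                                     min_df=min_df, max_df=max_df,
                                     ngram_range=ngram_range)
    # learn the vocabulary and turn the corpus into a sparse document-term matrix
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix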
# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                  feature_type='tfidf',
                                                  min_df=0.2, max_df=0.9,
                                                  ngram_range=(1, 2))
# inspect the number of documents and features
print(feature_matrix.shape)
>>>(2823, 15399)
# get the feature names (on scikit-learn >= 1.0, use get_feature_names_out() instead)
feature_names = vectorizer.get_feature_names()
# print a few of the features
print(feature_names[:60])
>>>['000', '005', '01', '037', '09', '10', '100', '1000', '100000', '1000000', '100ghz', '1080', '108758', '10w', '11', '110', '114', '12', '120', '1200', '122', '124', '1240', '13', '130', '1344', '1345', '135', '1364', '137', '14', '15', '150', '1500', '150cm', '155', '156', '1587', '16', '1600', '1602', '1644', '1662', '1675', '17', '1704', '1768', '1781', '1793', '18', '1810', '1812', '1832', '1840', '1841', '1850', '1861', '1869', '1870', '1881']
print(feature_names[-60:])
>>>['黄易', '黄暖心', '黄有璨', '黄梓', '黄永玉', '黄灯', '黄玫瑰', '黄蓉', '黄金', '黄金时代', '黄霖', '黄鹂', '黎万强', '黑人', '黑仔', '黑塞', '黑夜', '黑客', '黑客帝国', '黑斯廷斯', '黑暗', '黑暗时代', '黑板报', '黑格尔', '黑洞', '黑烟', '黑白', '黑石', '黑箱', '黑色', '黑色幽默', '黑镜', '黑马', '黔西', '默奇', '默默', '默默地', '默默无闻', '黛玉', '點滴', '黩武', '黯淡', '黯然', '鼓励', '鼓声', '鼓手', '鼓语', '齐名', '齐后', '齐欢', '齐铭则', '龙之介', '龙凤', '龙华', '龙应台', '龙战', '龙有', '龙的传人', '龙神', '龚古尔']
3.4 Text Clustering
from sklearn.cluster import KMeans


def k_means(feature_matrix, num_clusters=10):
    # fit KMeans on the tf-idf matrix and return the model together with each document's cluster label
    km = KMeans(n_clusters=num_clusters, max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters
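num_clusters is simply fixed at 10 below; the article does not derive that choice. If you want to sanity-check it, a quick and purely illustrative inertia sweep over a few candidate values looks like this:
# illustrative only: compare the KMeans inertia for a few cluster counts
for k in (5, 10, 15, 20):
    model = KMeans(n_clusters=k, max_iter=10000).fit(feature_matrix)
    print(k, model.inertia_)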
num_clusters = 10
km_obj, clusters = k_means(feature_matrix=feature_matrix, num_clusters=num_clusters)
book_data['Cluster'] = clusters
book_data.head()
from collections import Counter
# count how many books fall into each cluster
c = Counter(clusters)
print(c.items())
>>>dict_items([(4, 323), (3, 102), (5, 370), (7, 227), (8, 190), (2, 238), (0, 1214), (6, 107), (1, 18), (9, 34)])
def get_cluster_data(clustering_obj, book_data, feature_names, num_clusters, topn_features=10):
    cluster_details = {}
    # order each cluster center's features by descending weight
    ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]
    # collect the key features of each cluster
    # collect the books of each cluster
    for cluster_num in range(num_clusters):
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster_num'] = cluster_num
        key_features = [feature_names[index]
                        for index
                        in ordered_centroids[cluster_num, :topn_features]]
        cluster_details[cluster_num]['key_features'] = key_features
        books = book_data[book_data['Cluster'] == cluster_num]['title'].values.tolist()
        cluster_details[cluster_num]['books'] = books
    return cluster_details
def print_cluster_data(cluster_data):
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details: '.format(cluster_num))
        print('-' * 20)
        print('Key features: ', cluster_details['key_features'])
        print('Books in this cluster: ')
        print(', '.join(cluster_details['books']))
        print('=' * 40)
cluster_data = get_cluster_data(clustering_obj=km_obj,
                                book_data=book_data,
                                feature_names=feature_names,
                                num_clusters=num_clusters,
                                topn_features=5)
print_cluster_data(cluster_data)