# Chinese word segmentation
def cut_word(text):
    """Segment ``text`` with jieba and join the pieces with single spaces.

    Args:
        text: The text to segment; it is passed unchanged to ``jieba.cut``.

    Returns:
        One string containing every token yielded by ``jieba.cut(text)``,
        in order, separated by a single space.
    """
    tokens = jieba.cut(text)
    return " ".join(tokens)
# Feature extraction for Chinese text
def count_chinese_dome() -> None:
    """Print bag-of-words counts for a jieba-segmented Chinese sample.

    The sample text is segmented with ``cut_word`` and vectorized with
    ``CountVectorizer``. Three things are printed, in this order: the dense
    count matrix, the learned feature names, and the object returned by
    ``fit_transform`` itself.
    """
    # NOTE: there are no commas between the adjacent string literals below,
    # so Python concatenates them into ONE string and ``data`` holds a single
    # document.
    # NOTE(review): add commas if three separate documents were intended.
    data = [
        "10艘中俄军舰穿过津轻海峡,这一举措合乎国际法,无可指摘,却引起日本国内“异样反应”。"
        "19日,日本内阁官房副长官矶崎仁彦称,日方对此“高度关注”,"
        "“将对我国周边海空域进行警戒和监视,采取万全的应对姿态”。"
    ]
    segmented = [cut_word(sent) for sent in data]

    # 1. Instantiate the transformer; the stop-word list is empty.
    transfer = CountVectorizer(stop_words=[])
    # 2. Learn the vocabulary and build the document-term matrix.
    counts = transfer.fit_transform(segmented)

    print(counts.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement and fall back only on old releases; list()
    # keeps the printed form a plain Python list in both cases.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print(feature_names)

    print(counts)
# TF-IDF text feature extraction
def tfidf_demo() -> None:
    """Print TF-IDF features for a jieba-segmented Chinese sample.

    The sample text is segmented with ``cut_word`` and vectorized with
    ``TfidfVectorizer``. The dense TF-IDF matrix is printed first, followed
    by the learned feature names.
    """
    # NOTE: there are no commas between the adjacent string literals below,
    # so Python concatenates them into ONE string and ``data`` holds a single
    # document.
    # NOTE(review): add commas if three separate documents were intended.
    data = [
        "10艘中俄军舰穿过津轻海峡,这一举措合乎国际法,无可指摘,却引起日本国内“异样反应”。"
        "19日,日本内阁官房副长官矶崎仁彦称,日方对此“高度关注”,"
        "“将对我国周边海空域进行警戒和监视,采取万全的应对姿态”。"
    ]
    segmented = [cut_word(sent) for sent in data]

    # 1. Instantiate the transformer; the stop-word list is empty.
    transfer = TfidfVectorizer(stop_words=[])
    # 2. Learn the vocabulary and compute the TF-IDF matrix.
    weights = transfer.fit_transform(segmented)

    print(weights.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement and fall back only on old releases; list()
    # keeps the printed form a plain Python list in both cases.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print(feature_names)