文章目录
- 一、项目目录与地址
- 二、Alink 中文情感分析:微博评论情感分析
- 三、HanLP 中文情感分析
- 四、SparkML 中文情感分类(待定)
一、项目目录与地址
Github:https://github.com/Zhifa-Liu/EmotionClassDemo
- cn.edu.neu.alink:Alink 中文情感分析
- cn.edu.neu.bayes:在 https://github.com/marwincn/pubsenti-finder 代码基础上,略作修改后的贝叶斯情感分类,效果似乎不太好,不予介绍
- cn.edu.neu.hanlp:HanLP 中文情感分析
- cn.edu.neu.sparkml:SparkML 中文情感分析,待定
- cn.edu.neu.zoom.data:中文情感分析(文本分类)使用的数据集
- 中文情感挖掘语料-ChnSentiCorp(谭松波)
- 搜狗文本分类语料库迷你版
- 微博评论情感数据集:weibo_senti_100k.csv
- cn.edu.neu.zoom.model:保存的情感分析模型
数据集「中文情感挖掘语料-ChnSentiCorp(谭松波)」与「搜狗文本分类语料库迷你版」的下载链接可以从以下地址找到;微博评论情感数据集(weibo_senti_100k.csv)直接百度搜索即可:
https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
二、Alink 中文情感分析:微博评论情感分析
package cn.edu.neu.alink;
import cn.edu.neu.alink.cons.ClassifierConstant;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.batch.source.TextSourceBatchOp;
import com.alibaba.alink.pipeline.LocalPredictor;
import com.alibaba.alink.pipeline.Pipeline;
import com.alibaba.alink.pipeline.PipelineModel;
import com.alibaba.alink.pipeline.classification.LogisticRegression;
import com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier;
import com.alibaba.alink.pipeline.dataproc.Imputer;
import com.alibaba.alink.pipeline.nlp.DocCountVectorizer;
import com.alibaba.alink.pipeline.nlp.Segment;
import com.alibaba.alink.pipeline.nlp.StopWordsRemover;
import org.apache.flink.types.Row;
import java.io.File;
import java.util.List;
/**
* @author 32098
*/
public class CommentClassifier {
    // Shared pipeline model; set by one of the init* methods and used by getClassification.
    private static PipelineModel pipelineModel;

    /**
     * Initializes the classifier with a Naive Bayes text model: loads a previously
     * saved model from disk, or — when none exists — trains one on the weibo
     * comment dataset and saves it.
     */
    public static void initNaiveBayesModel() {
        // Bug fix: load from the NB model path (originally this loaded the LR path,
        // so a previously saved NB model was never found and training re-ran every time).
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_NB_MODEL_PATH);
        if (pipelineModel == null) {
            System.err.println("载入模型失败...");
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing "review" values with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF, Term Frequency: the feature-vector type used by DocCountVectorizer
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new NaiveBayesTextClassifier().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_NB_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; BatchOperator.execute()
                // is what actually writes the model out.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    /**
     * Initializes the classifier with a Logistic Regression model: loads a
     * previously saved model from disk, or — when none exists — trains one on
     * the weibo comment dataset and saves it.
     */
    public static void initLogisticRegressionModel() {
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_LR_MODEL_PATH);
        if (pipelineModel == null) {
            System.err.println("载入模型失败...");
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing "review" values with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF, Term Frequency: the feature-vector type used by DocCountVectorizer
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new LogisticRegression().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            // Bug fix: save to the LR model path (originally this overwrote the
            // NB model file, and the LR model could never be loaded back).
            pipelineModel.save(ClassifierConstant.WEIBO_LR_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; BatchOperator.execute()
                // is what actually writes the model out.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    /**
     * Builds the training source: the weibo CSV with columns (label int, review string),
     * skipping the header line.
     */
    private static BatchOperator<?> getCommentSourceOp() {
        return new CsvSourceBatchOp()
                .setFilePath(ClassifierConstant.DATASET_WEIBO_PATH)
                .setSchemaStr("label int, review string")
                .setIgnoreFirstLine(true);
    }

    /**
     * Classifies a single comment, lazily initializing the Naive Bayes model if
     * no model has been initialized yet.
     *
     * @param text comment text to classify
     * @return predicted label as a string ("0"/"1"), or null on prediction failure
     */
    public static String getClassification(String text) {
        if (pipelineModel == null) {
            System.err.println("As you didn't call initNaiveBayesModel() or initLogisticRegressionModel() before using getClassification(String text),\n" +
                    "we will call initNaiveBayesModel() to set value for our inner attribute (pipelineModel) to get your text's Classification");
            initNaiveBayesModel();
        }
        try {
            LocalPredictor localPredictor = pipelineModel.collectLocalPredictor("review string");
            // Output schema is (review, featureText, featureVector, pred); field 3 is the prediction.
            Row row = Row.of(text);
            return String.valueOf(localPredictor.map(row).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        System.out.println(getClassification("你真好"));
        System.out.println(getClassification("哇哦今年的春夏季衣服不错诶"));
        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
                .setTextCol("review");
        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
                .setTextCol("review");
        List<Row> negRows = textSourceBatchOp1.getDataSet().collect();
        List<Row> posRows = textSourceBatchOp2.getDataSet().collect();
        int acc = 0;
        for (Row negRow : negRows) {
            // expected to be 0 (negative)
            String text = getClassification((String) negRow.getField(0));
            System.out.println(text);
            if ("0".equals(text)) {
                acc += 1;
            }
        }
        for (Row posRow : posRows) {
            // expected to be 1 (positive)
            String text = getClassification((String) posRow.getField(0));
            System.out.println(text);
            // Bug fix: positive samples are correct when predicted "1"
            // (originally compared against "0", inflating/deflating accuracy incorrectly).
            if ("1".equals(text)) {
                acc += 1;
            }
        }
        System.out.println("Acc: " + (double) acc / (negRows.size() + posRows.size()));
    }
}
这个分类感觉有点慢!!!
三、HanLP 中文情感分析
HanLP git:https://github.com/hankcs/HanLP/tree/doc-zh HanLP 中文情感分析:https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
通过 HanLP 的 NaiveBayesClassifier 与 HanLPTokenizer 实现微博评论情感分析、酒店评论情感分析与文本分类:
package cn.edu.neu.hanlp;
import cn.edu.neu.hanlp.cons.ClassifierConstant;
import com.hankcs.hanlp.classification.classifiers.AbstractClassifier;
import com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier;
import com.hankcs.hanlp.classification.corpus.FileDataSet;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;
import java.io.*;
import java.util.Map;
/**
* @author 32098
*/
public class HanLpClassifier {
    // Shared HanLP classifier; set by initClassifier and used by the predict helpers.
    private static AbstractClassifier classifier = null;

    /**
     * Initializes the classifier: loads a serialized NaiveBayesModel from
     * modelPath, or — when none exists — trains a new model from dataPath
     * (a single CSV-like file "label,text" with a header line, or a directory
     * of per-category files) and saves it.
     *
     * @param dataPath  path to the training data (file or directory)
     * @param modelPath path of the serialized model file
     */
    public static void initClassifier(String dataPath, String modelPath) {
        AbstractModel model = loadModel(modelPath);
        if (model == null) {
            System.out.println("No model find, begin train model!");
            IDataSet dataSet = null;
            try {
                System.out.println(dataPath);
                File f = new File(dataPath);
                if (f.isFile()) {
                    // Bug fix: try-with-resources — the reader was never closed before.
                    try (BufferedReader reader = new BufferedReader(new FileReader(dataPath))) {
                        String str;
                        dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer());
                        System.out.println("Prepare dataset!");
                        // ignore first line (header)
                        str = reader.readLine();
                        while ((str = reader.readLine()) != null) {
                            // Each line is "<label>,<text>": char 0 is the label, text starts at index 2.
                            dataSet.add(str.substring(0, 1), str.substring(2));
                        }
                    }
                } else {
                    // Directory layout: one sub-folder per category.
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer()).load(dataPath, "UTF-8");
                }
                System.out.println("Dataset prepared!");
            } catch (IOException e) {
                e.printStackTrace();
            }
            classifier = new NaiveBayesClassifier();
            classifier.train(dataSet);
            model = classifier.getModel();
            saveModel(modelPath, model);
        } else {
            System.out.println("NaiveBayesModel init succeeded!");
            classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
        }
    }

    /** Serializes the trained model to modelPath (best effort; failures are logged). */
    private static void saveModel(String modelPath, AbstractModel model) {
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelPath))) {
            oos.writeObject(model);
            System.out.println("Save NaiveBayesModel Succeeded!");
        } catch (Exception e) {
            System.err.println("Save NaiveBayesModel Failed!");
            System.err.println(e.getMessage());
        }
    }

    /** Deserializes a model from modelPath, or returns null if missing/unreadable. */
    private static AbstractModel loadModel(String modelPath) {
        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath))) {
            Object o = ois.readObject();
            return (AbstractModel) o;
        } catch (FileNotFoundException e) {
            System.err.println("Load NaiveBayesModel Failed(NaiveBayesModel file:" + modelPath + " not Found!)");
        } catch (Exception e) {
            System.err.println(e.getMessage());
        }
        return null;
    }

    /**
     * Returns a sentiment score for a weibo comment: P("1") - P("0"),
     * i.e. positive probability minus negative probability (range -1..1).
     * Lazily initializes the classifier with the weibo dataset if needed.
     */
    public static Double getScoreOfWeiboComment(String sentence) {
        if (classifier == null) {
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        return map.get("1") - map.get("0");
    }

    /**
     * Returns the predicted category label for the given text.
     * Lazily initializes the classifier with the weibo dataset if needed.
     */
    public static String getClassification(String sentence) {
        if (classifier == null) {
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        // Fix: removed an unused classifier.predict(sentence) call whose result was discarded.
        return classifier.classify(sentence);
    }
}
package cn.edu.neu.hanlp;
import cn.edu.neu.hanlp.cons.ClassifierConstant;
/**
* @author 32098
*
* 情感分类、中文文本分类
*/
public class Test {
    /**
     * Demo driver: exercises the HanLP classifier on three corpora in turn —
     * weibo comment sentiment, hotel review sentiment, and Sogou topic classification.
     */
    public static void main(String[] args) {
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        printLabel("天安门");
        printLabel("哇哦今年的春夏季衣服不错诶");
        printLabel("去死吧");
        printLabel("加油");
        printLabel("你真好");
        System.out.println(HanLpClassifier.getScoreOfWeiboComment("你真好"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_HOTEL_PATH, ClassifierConstant.HOTEL_MODEL_PATH);
        printLabel("酒店太差了");

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_SOUGOU_PATH, ClassifierConstant.SOUGOU_MODEL_PATH);
        printLabel("篮球、羽毛球");
    }

    /** Prints the predicted label of the currently initialized classifier for the given text. */
    private static void printLabel(String text) {
        System.out.println(HanLpClassifier.getClassification(text));
    }
}
运行结果:
四、SparkML 中文情感分类(待定)
暂略