对于文本信息的向量化,Mahout 已经提供了工具类,它基于 Lucene 对文本信息进行分析,然后创建文本向量。Mahout 提供下面两个命令来将文本转成向量形式(转化成向量后可以聚类):
1.mahout seqdirectory:将文本文件转成SequenceFile文件,SequenceFile文件是一种以二进制形式存储key-value键值对的文件,对应的源文件是org.apache.mahout.text.SequenceFilesFromDirectory.java

2.mahout seq2sparse:将SequenceFile转成向量文件,对应的源文件是org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java

我是将mahout源码导入到eclipse中,对以上的两个源文件分别进行运行(运行时必须配置参数,有输入、输出、字符编码)转化的,生成的向量文件目录结构是:
df-count 目录:保存着文本的频率信息 
tf-vectors 目录:保存着以 TF 作为权值的文本向量 
tfidf-vectors 目录:保存着以 TFIDF 作为权值的文本向量 
tokenized-documents 目录:保存着分词过后的文本信息 
wordcount 目录:保存着全局的词汇出现的次数 
dictionary.file-0 文件:保存着这些文本的词汇表 
frequency.file-0 文件:保存着词汇表对应的频率信息。

查看转化结果:

mahout seqdumper:将SequenceFile文件转成文本形式,对应的源文件是org.apache.mahout.utils.SequenceFileDumper.java
mahout vectordump:将向量文件转成可读的文本形式,对应的源文件是org.apache.mahout.utils.vectors.VectorDumper.java
mahout clusterdump:分析最后聚类的输出结果,对应的源文件是org.apache.mahout.utils.clustering.ClusterDumper.java

具体每种命令如何使用、参数如何选择,可以在命令后面加 -h 或 --help 查看。

下面是我在项目中用到的一些源码


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.utils.io.ChunkedWriter;

import com.google.common.io.Closeables;


public class WriteToSequenceFileForBayesian  extends AbstractJob{//使用聚类的文件,所以要传入一个num,比如2000,表示以2000为单位,训练集的划分规范。
	public static void main(String args[]) throws Exception{
		ToolRunner.run(new WriteToSequenceFileForBayesian(), args);
    }
	@Override
	public int run(String[] arg0) throws Exception {
	    String inputPath=arg0[0];//
    	String outputpoints=arg0[1];//
        int k = Integer.parseInt(arg0[2]);//
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inputPath );
        FSDataInputStream dis = fs.open(inPath);
        LineReader in = new LineReader(dis,conf);  
        ChunkedWriter writer = new ChunkedWriter(conf, 64, new Path(outputpoints));
        Text line = new Text();
        //按行读取
        long recNum = 0;
        StringBuilder ss=new StringBuilder();
        while(in.readLine(line) > 0){
        	String aline=line.toString();
        	String[] strs=aline.split(" ");
        	if (recNum
   
   
    
    -2&&Double.parseDouble(strs[0])<4) {
					ss.append("one_first");
				}else if (Double.parseDouble(strs[0])<-2) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>4) {
					ss.append("high_first");
				}
        		ss.append(",");
        		//处理第2个数
        		if (Double.parseDouble(strs[1])>-3&&Double.parseDouble(strs[1])<3) {
					ss.append("zero_second");
				}else if (Double.parseDouble(strs[1])<-3) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>3) {
					ss.append("high_second");
				}
        		ss.append(",");
        		//处理第3个数
        		if (Double.parseDouble(strs[2])>-2&&Double.parseDouble(strs[2])<4) {
					ss.append("one_third");
				}else if (Double.parseDouble(strs[2])<-2) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>4) {
					ss.append("high_third");
				}
        		ss.append(",");
        		//处理第4个数
        		if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<5) {
					ss.append("two_fourth");
				}else if (Double.parseDouble(strs[3])<-1) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>5) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		//处理第5个数
        		if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<4) {
					ss.append("one_fifth");
				}else if (Double.parseDouble(strs[4])<-2) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>4) {
					ss.append("high_fifth");
				}
        		writer.write("first", ss.toString());
			}else if (recNum
    
    
     
     1.5&&Double.parseDouble(strs[0])<2.5) {
					ss.append("two_first");
				}else if (Double.parseDouble(strs[0])<1.5) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>2.5) {
					ss.append("high_first");
				}
        		ss.append(",");
        		//处理第2个数
        		if (Double.parseDouble(strs[1])>0.5&&Double.parseDouble(strs[1])<1.5) {
					ss.append("one_second");
				}else if (Double.parseDouble(strs[1])<0.5) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>1.5) {
					ss.append("high_second");
				}
        		ss.append(",");
        		//处理第3个数
        		if (Double.parseDouble(strs[2])>-0.5&&Double.parseDouble(strs[2])<0.5) {
					ss.append("zero_third");
				}else if (Double.parseDouble(strs[2])<-0.5) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>0.5) {
					ss.append("high_third");
				}
        		ss.append(",");
        		//处理第4个数
        		if (Double.parseDouble(strs[3])>0.5&&Double.parseDouble(strs[3])<1.5) {
					ss.append("one_fourth");
				}else if (Double.parseDouble(strs[3])<0.5) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>1.5) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		//处理第5个数
        		if (Double.parseDouble(strs[4])>0.5&&Double.parseDouble(strs[4])<1.5) {
					ss.append("one_fifth");
				}else if (Double.parseDouble(strs[4])<0.5) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>1.5) {
					ss.append("high_fifth");
				}
        		writer.write("second", ss.toString());
			}else if (recNum
     
     
      
      0.9&&Double.parseDouble(strs[0])<1.1) {
					ss.append("one_first");
				}else if (Double.parseDouble(strs[0])<0.9) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>1.1) {
					ss.append("high_first");
				}
        		ss.append(",");
        		//处理第2个数
        		if (Double.parseDouble(strs[1])>0.9&&Double.parseDouble(strs[1])<1.1) {
					ss.append("one_second");
				}else if (Double.parseDouble(strs[1])<0.9) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>1.1) {
					ss.append("high_second");
				}
        		ss.append(",");
        		//处理第3个数
        		if (Double.parseDouble(strs[2])>1.9&&Double.parseDouble(strs[2])<2.1) {
					ss.append("two_third");
				}else if (Double.parseDouble(strs[2])<1.9) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>2.1) {
					ss.append("high_third");
				}
        		ss.append(",");
        		//处理第4个数
        		if (Double.parseDouble(strs[3])>-0.1&&Double.parseDouble(strs[3])<0.1) {
					ss.append("zero_fourth");
				}else if (Double.parseDouble(strs[3])<-0.1) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>0.1) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		//处理第5个数
        		if (Double.parseDouble(strs[4])>0.9&&Double.parseDouble(strs[4])<1.1) {
					ss.append("one_fifth");
				}else if (Double.parseDouble(strs[4])<0.9) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>1.1) {
					ss.append("high_fifth");
				}
        		writer.write("third", ss.toString());
			}else if (recNum
      
      
       
       -1&&Double.parseDouble(strs[0])<3) {
					ss.append("one_first");
				}else if (Double.parseDouble(strs[0])<-1) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>3) {
					ss.append("high_first");
				}
        		ss.append(",");
        		//处理第2个数
        		if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<4) {
					ss.append("two_second");
				}else if (Double.parseDouble(strs[1])<0) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>4) {
					ss.append("high_second");
				}
        		ss.append(",");
        		//处理第3个数
        		if (Double.parseDouble(strs[2])>-1&&Double.parseDouble(strs[2])<3) {
					ss.append("one_third");
				}else if (Double.parseDouble(strs[2])<-1) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>3) {
					ss.append("high_third");
				}
        		ss.append(",");
        		//处理第4个数
        		if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<3) {
					ss.append("one_fourth");
				}else if (Double.parseDouble(strs[3])<-1) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>3) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		//处理第5个数
        		if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<2) {
					ss.append("zero_fifth");
				}else if (Double.parseDouble(strs[4])<-2) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>2) {
					ss.append("high_fifth");
				}
        		writer.write("fourth", ss.toString());
			}else if (recNum
       
       
         -1&&Double.parseDouble(strs[0])<1) { ss.append("zero_first"); }else if (Double.parseDouble(strs[0])<-1) { ss.append("low_first"); }else if (Double.parseDouble(strs[0])>1) { ss.append("high_first"); } ss.append(","); //处理第2个数 if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<2) { ss.append("one_second"); }else if (Double.parseDouble(strs[1])<0) { ss.append("low_second"); }else if (Double.parseDouble(strs[1])>2) { ss.append("high_second"); } ss.append(","); //处理第3个数 if (Double.parseDouble(strs[2])>0&&Double.parseDouble(strs[2])<2) { ss.append("one_third"); }else if (Double.parseDouble(strs[2])<0) { ss.append("low_third"); }else if (Double.parseDouble(strs[2])>2) { ss.append("high_third"); } ss.append(","); //处理第4个数 if (Double.parseDouble(strs[3])>0&&Double.parseDouble(strs[3])<2) { ss.append("one_fourth"); }else if (Double.parseDouble(strs[3])<0) { ss.append("low_fourth"); }else if (Double.parseDouble(strs[3])>2) { ss.append("high_fourth"); } ss.append(","); //处理第5个数 if (Double.parseDouble(strs[4])>1&&Double.parseDouble(strs[4])<3) { ss.append("two_fifth"); }else if (Double.parseDouble(strs[4])<1) { ss.append("low_fifth"); }else if (Double.parseDouble(strs[4])>3) { ss.append("high_fifth"); } writer.write("fifth", ss.toString()); } } Closeables.close(writer, false); dis.close(); in.close(); return 0; } }




import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.mahout.clustering.kmeans.Kluster;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.text.SequenceFilesFromDirectory;

public class WriteToSequenceFile {
    public static void main(String args[]) throws Exception {
    	String inputPath=args[0];//文本数据文件输入目录
    	String outputpoints=args[1];//sequenceFile中的point数据输出目录
    	String outputclusters=args[2];//sequenceFile中的cluster数据输出目录
        int k = Integer.parseInt(args[3]);//k个中心
        List
   
   
    
     vectors = new ArrayList
    
    
     
     ();
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inputPath );
        FSDataInputStream dis = fs.open(inPath);
        LineReader in = new LineReader(dis,conf);  
        Text line = new Text();
        SequenceFile.Writer pointwriter = new SequenceFile.Writer(fs, conf, new Path(outputpoints), LongWritable.class, VectorWritable.class);
        //按行读取
        long recNum = 0;
        VectorWritable vecWrite = new VectorWritable();
        while(in.readLine(line) > 0){
        	String aline=line.toString();
        	String[] strs=aline.split(" ");
        	double[] fr = new double[5];
        	for (int i = 0; i < strs.length; i++) {
				fr[i]=Double.parseDouble(strs[i]);
			}
            Vector vec = new RandomAccessSparseVector(fr.length);
            vec.assign(fr);
            vecWrite.set(vec);
            pointwriter.append(new LongWritable(recNum++), vecWrite);
            if (vectors.size()