java实现正向最大匹配分词

原创

yan456jie 2023-07-10 20:39:46 博主文章分类：NLP ©著作权

©著作权归作者所有：来自51CTO博客作者yan456jie的原创作品，请联系作者获取转载授权，否则将追究法律责任

1、下载mmseg4j-1.8.5分词器，取出其中的words.dic词典，放到classpath下的MyMM目录中（代码通过类加载器读取MyMM/words.dic）。

package com.yj.nlp_common.seg.MyMMSeg;

import java.util.HashMap;

/**
 * A single node of the in-memory dictionary trie.
 *
 * <p>Each node carries one character of a word and links to the nodes for the
 * characters that may follow it.
 */
public class TrieNode {
    /** The character stored at this node; {@code '\0'} when built with the no-arg constructor. */
    public char key;

    /** Flag intended to mark that a word ends at this node; {@code false} until set by the caller. */
    public boolean bound;

    /** Child nodes, indexed by the character that follows this node's character. */
    public HashMap<Character, TrieNode> childs = new HashMap<>();

    /** Creates a node whose key is the NUL character. */
    public TrieNode() {
        this('\0');
    }

    /**
     * Creates a node for the given character.
     *
     * @param key the character this node represents
     */
    public TrieNode(char key) {
        this.key = key;
    }
}

package com.yj.nlp_common.seg.MyMMSeg;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;

/**
 * Word dictionary held in memory as a character trie and handed out as a
 * single shared instance.
 *
 * <p>Every line of the dictionary file is inserted as one word. The instance
 * is created on the first {@code getInstance} call; no synchronization is
 * performed around that creation.
 */
public class TrieDictionary {

    /** Classpath location of the bundled dictionary used when no file is named. */
    private static final String DEFAULT_DICTIONARY_RESOURCE = "MyMM//words.dic";

    private static TrieDictionary trieDictionary = null;

    private static List<String> wordlist = null;

    private static TrieNode root = null;

    /**
     * Returns the root of the trie.
     *
     * @return the root node, or {@code null} if reading the dictionary file failed
     */
    public TrieNode getRoot(){
        return root;
    }

    /**
     * Returns the shared dictionary, loading it from the given file on first use.
     *
     * @param dictionaryName path of the dictionary file; only consulted when the
     *                       shared instance does not exist yet, and {@code null}
     *                       selects the bundled default dictionary
     * @return the shared dictionary instance
     */
    public static TrieDictionary getInstance(String dictionaryName){
        if (trieDictionary == null) {
            trieDictionary = new TrieDictionary(dictionaryName);
        }
        return trieDictionary;
    }

    /**
     * Returns the shared dictionary, loading the bundled default dictionary on first use.
     *
     * @return the shared dictionary instance
     */
    public static TrieDictionary getInstance(){
        if (trieDictionary == null) {
            // Resolve the resource only when a load is actually needed.
            trieDictionary = new TrieDictionary(defaultDictionaryPath());
        }
        return trieDictionary;
    }

    /**
     * Resolves the bundled dictionary on the classpath to a file system path.
     *
     * @return the path component of the resource URL
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private static String defaultDictionaryPath() {
        java.net.URL resource =
                TrieDictionary.class.getClassLoader().getResource(DEFAULT_DICTIONARY_RESOURCE);
        if (resource == null) {
            throw new IllegalStateException(
                    "Dictionary resource not found on classpath: " + DEFAULT_DICTIONARY_RESOURCE);
        }
        return resource.getPath();
    }

    /**
     * Reads the dictionary file and builds the trie from its lines.
     *
     * <p>An {@link IOException} while reading is printed and otherwise ignored,
     * in which case the trie root stays unset.
     *
     * @param dictionaryName path of the dictionary file, or {@code null} for the
     *                       bundled default
     */
    private TrieDictionary(String dictionaryName){
        // Honor the caller's path; the original always loaded the bundled resource.
        String filePath = (dictionaryName != null) ? dictionaryName : defaultDictionaryPath();
        try {
            // Decode explicitly as UTF-8 instead of relying on the platform default charset.
            wordlist = FileUtils.readLines(new File(filePath), "UTF-8");
            root = new TrieNode();
            for (String word : wordlist) {
                addWord(word);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one word into the trie, reusing nodes of any shared prefix.
     *
     * @param word the word to insert; an empty string is ignored
     */
    private void addWord(String word){
        TrieNode current = root;
        for (int i = 0; i < word.length(); ++i) {
            char c = word.charAt(i);
            TrieNode next = current.childs.get(c);
            if (next == null) {
                next = new TrieNode(c);
                current.childs.put(c, next);
            }
            current = next;
        }
        // Mark the final node even when it already existed, so a word that is a
        // prefix of an earlier-inserted word is still recorded as a word end.
        if (current != root) {
            current.bound = true;
        }
    }

}

package com.yj.nlp_common.seg.MyMMSeg;

/**
 * Character classification helpers: every {@code char} is exactly one of
 * separator, Chinese, or other.
 */
public class CharacterType {

    /**
     * All characters treated as separators: full-width and ASCII punctuation
     * plus newline, carriage return, tab and space.
     */
    private static final String SEPARATORS =
            "\u3002\uFF01\uFF1F\uFF1A\uFF1B\u3001\uFF0C\uFF08\uFF09\u300A\u300B\u3010\u3011{}\u201C\u201D\u2018\u2019!?:;,()<>[]{}\"'\n\r\t ";

    /** First code point of the CJK Unified Ideographs block (U+4E00). */
    private static final char CJK_FIRST = '\u4E00';

    /**
     * Last code point of the CJK Unified Ideographs block (U+9FFF). The former
     * bound U+9FBF excluded the ideographs at the end of the block.
     */
    private static final char CJK_LAST = '\u9FFF';

    /**
     * Tells whether the character is a separator.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is one of the separator characters
     */
    public static boolean isCharSeperator(char c) {
        return SEPARATORS.indexOf(c) != -1;
    }

    /**
     * Tells whether the character is a Chinese ideograph.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} lies in the CJK Unified Ideographs block
     */
    public static boolean isCharChinese(char c) {
        return c >= CJK_FIRST && c <= CJK_LAST;
    }

    /**
     * Tells whether the character is neither a separator nor Chinese.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is not a separator and not Chinese
     */
    public static boolean isCharOther(char c) {
        return !isCharSeperator(c) && !isCharChinese(c);
    }
}

package com.yj.nlp_common.seg.MyMMSeg;

import java.io.IOException;
/**
 * Dictionary-driven segmenter: scans a sentence from left to right and emits
 * each recognized token followed by a {@code '|'} marker.
 */
public class MMSegmenter {
    /** Shared dictionary, obtained once when this class is initialized. */
    public static TrieDictionary dict = null;

    static {
        dict = TrieDictionary.getInstance();
    }

    /**
     * Splits a sentence into tokens.
     *
     * <p>A run of Chinese characters is consumed for as long as it follows
     * edges of the dictionary trie; a Chinese character with no edge from the
     * root becomes a token by itself. A run of "other" characters (neither
     * Chinese nor separator) becomes one token. Separators are dropped.
     *
     * <p>NOTE(review): the trie walk never reads {@code TrieNode.bound}, so a
     * Chinese token is the longest trie prefix found and is not checked to be
     * a complete dictionary entry. Confirm whether that is intended.
     *
     * @param sentence the text to segment
     * @return the tokens concatenated, each terminated by {@code '|'}
     */
    public String segment(String sentence) {
        StringBuilder out = new StringBuilder();
        TrieNode trieRoot = dict.getRoot();
        int total = sentence.length();

        int pos = 0;
        while (pos < total) {
            char ch = sentence.charAt(pos);
            if (CharacterType.isCharChinese(ch)) {
                pos = consumeChinese(sentence, pos, trieRoot, out);
            } else if (CharacterType.isCharOther(ch)) {
                pos = consumeOther(sentence, pos, out);
            } else {
                // Separator: skipped, so consecutive separators emit nothing.
                pos++;
            }
        }
        return out.toString();
    }

    /**
     * Emits one Chinese token starting at {@code start} and returns the index
     * of the first character after it.
     */
    private static int consumeChinese(String sentence, int start, TrieNode trieRoot, StringBuilder out) {
        char first = sentence.charAt(start);
        TrieNode node = trieRoot.childs.get(first);
        out.append(first);
        int next = start + 1;
        if (node != null) {
            // Keep extending while the following character is Chinese and the trie has an edge for it.
            while (next < sentence.length()) {
                char ch = sentence.charAt(next);
                node = node.childs.get(ch);
                if (node == null || !CharacterType.isCharChinese(ch)) {
                    break;
                }
                out.append(ch);
                next++;
            }
        }
        out.append('|');
        return next;
    }

    /**
     * Emits one token made of consecutive "other" characters starting at
     * {@code start} and returns the index of the first character after it.
     */
    private static int consumeOther(String sentence, int start, StringBuilder out) {
        int next = start;
        do {
            out.append(sentence.charAt(next));
            next++;
        } while (next < sentence.length() && CharacterType.isCharOther(sentence.charAt(next)));
        out.append('|');
        return next;
    }

    /** Segments a sample sentence and prints the result. */
    public static void main(String[] args) throws IOException {
        MMSegmenter segmenter = new MMSegmenter();
        String result = segmenter.segment("中华人民共和国是一个伟大的国家hello中国人，，，我是中国人");
        System.out.println(result);
    }
}