本文思维导图

  1. 基于HMM的分词算法
  2. 对分词统计tf-idf参数
停用词库
  1. 本文选用的停用词库来自 https://github.com/witlxx/tf-idf/blob/v0.0.1/stop_words.txt
  2. 上述停用词库参考了 https://github.com/goto456/stopwords中的cn_stopwords.txt
训练词库和测试词库
  1. 本文的训练词库见https://github.com/witlxx/tf-idf/blob/v0.0.1/trains.txt
  2. 本文的测试词库为下文代码中读取的 test1.txt ~ test4.txt (处理结果见"三、测试词库处理结果")
HMM算法的五个参数
  1. 隐含状态向量 states = {“B”, “E”, “M”, “S”}
  2. 观测序列(显式状态序列): 待分词的一段话
对于训练词库的处理
1) 首先需要对训练库中的每个汉字加标签,标签为隐含状态向量states中的元素;
	2) 标签规则是: 单个汉字的标签为"S"; 两个汉字组成的词的第一个字的标签为"B",第二个字的标签为"E"; 两个字以上的词,第一个字的标签为"B",中间几个字的标签为"M",最后一个字的标签为"E";
  1. 发射概率矩阵 emitpm
统计S1下各个汉字(Oi)出现的次数,S1∈{"B", "E", "M", "S"},然后将(S1, Oi)的次数除以标记为S1的次数。
  1. 转移概率矩阵 transpm
统计每个S1(开始状态)->T1(转移到的状态)的频数,然后将(S1,T1)的个数除以开始状态为S1的个数。S1∈{"B", "E", "M", "S"};T1∈{"B", "E", "M", "S"}。
  1. 初始概率向量 initpvec
统计在训练分词的标签中S1出现的次数,S1∈{"B", "E", "M", "S"},然后将每个频数除以总的标签数。
代码示例
一、数据预处理代码
const fs = require("fs");
/**
 * Remove duplicate entries from a list, keeping the first occurrence of each
 * value at its original relative position.
 *
 * A Set is used rather than a plain-object lookup so that elements keep their
 * original type and order, and so that values such as "__proto__" are not
 * silently lost. Values are compared with SameValueZero semantics.
 * @function removeDulp
 * @param {[Any]} list - values to de-duplicate
 * @returns {[Any]} a new array containing each distinct value once
 */
let removeDulp = (list) => {
  return Array.from(new Set(list));
};

/**
 * Locate the first occurrence of `patch` inside `sentence` and cut it out.
 * @param {*} sentence - text to search
 * @param {*} patch - text to look for
 * @param {Number} count - how many times this function has already been
 *   called for the same patch; a falsy value is treated as 0
 * @returns {Object} `{ sentence }` (unchanged text) when there is no match;
 *   otherwise `{ start, end, sentence }`, where `sentence` is the text with
 *   the match removed and `start`/`end` are the match bounds in the current
 *   text shifted right by `count * patch.length`
 */
let violenceMatch = (sentence, patch, count) => {
  // Each earlier call is taken to have removed one patch-sized chunk.
  const shift = (count || 0) * patch.length;
  const hit = sentence.indexOf(patch);
  if (hit === -1) {
    return { sentence };
  }
  const stop = hit + patch.length;
  return {
    start: hit + shift,
    end: stop + shift,
    sentence: sentence.slice(0, hit) + sentence.slice(stop),
  };
};

/**
 * Repeatedly remove `stopword` from `contents` until no occurrence is left.
 * @param {*} contents - text to clean
 * @param {*} stopword - stop word to strip out
 * @returns {Object} `{ data, range }`: `data` is the result of the final
 *   (non-matching) violenceMatch call, so `data.sentence` is the cleaned
 *   text; `range` holds one `[start, end]` pair per removed occurrence, as
 *   reported by violenceMatch
 */
let recurrentVolMat = (contents, stopword) => {
  const range = [];
  // An empty stop word matches everywhere with zero width; there is nothing
  // to remove and no meaningful position to record.
  if (stopword.length === 0) {
    return { data: { sentence: contents }, range };
  }
  let remaining = contents;
  let removed = 0;
  while (true) {
    const data = violenceMatch(remaining, stopword, removed);
    const { start, end } = data;
    if (Number.isInteger(start) && Number.isInteger(end)) {
      range.push([start, end]);
    }
    // An unchanged length means this pass found nothing: we are done.
    if (data.sentence.length === remaining.length) {
      return { data, range };
    }
    remaining = data.sentence;
    removed += 1;
  }
};

/**
 * Hook for correcting the recorded stop-word positions.
 * It is currently a pass-through: the argument is handed back untouched.
 * @function accumulateOrder
 * @param {*} positions - recorded stop-word positions
 * @returns {*} the same `positions` value
 */
let accumulateOrder = (positions) => positions;

/**
 * Strip every stop word listed in ./stop_words.txt from `contents` and
 * collect the positions reported for each removal.
 * @function stopWords
 * @param {String} contents - text to clean
 * @returns {Object} `{ content, lRanges }`: `content` is the cleaned text;
 *   `lRanges` holds, for each stop word processed, the list of ranges
 *   returned by recurrentVolMat (passed through accumulateOrder)
 */
let stopWords = (contents) => {
  // Accept both CRLF and LF line endings, and skip blank lines so that an
  // empty string is never treated as a stop word.
  const stopWordList = removeDulp(
    fs
      .readFileSync("./stop_words.txt")
      .toString()
      .split(/\r?\n/)
      .filter((word) => word !== "")
  );
  const ranges = [];
  let cleaned = contents;
  for (const word of stopWordList) {
    const result = recurrentVolMat(cleaned, word);
    cleaned = result.data.sentence;
    ranges.push(result.range);
  }
  return { content: cleaned, lRanges: accumulateOrder(ranges) };
};
二、训练词库处理结果
// Training vocabulary: read ./trains.txt, strip stop words, split into lines
// (CRLF or LF), trim each line, drop lines that end up empty, de-duplicate.
// Empty entries must not reach the tagging step, where they have no
// characters to label.
let trainsTxt = removeDulp(
  stopWords(fs.readFileSync("./trains.txt").toString())
    .content.split(/\r?\n/)
    .map((line) => line.trim())
    .filter((word) => word !== "")
);

结果为

[
  '成果',         '市场化',       '跨周期',   '降低',         '存款准备金',
  '微企业',       '贷款延期',     '本付息',   '信贷款',       '两项',
  '货币政策工具', '精准扶贫',     '制造业',   '绿色产业',     '信贷力度',
  '宏观审慎',     '政策框架',     '进步',     '健全',         '持续',
  '构建',         '要素完备',     '逆周期',   '资本缓机制',   '压力测试',
  '完善',         '系统性风险',   '系统性',   '监测',         '评估',
  '预警',         '跨境资金流动', '管理框架', '统筹',         '监管',
  '系统',         '性',           '金融机构', '金融控股公司', '取',
  '风险',         '攻坚战',       '阶段性',   '流动性',       '累计',
  '人民币',       '汇率',         '合理',     '均衡',         '基本',
  '金融',         '稳',           '企业',     '保业',         '预期效果',
  '经济规律',     '分层次',       '有梯度',   '政策',         '抗疫保供专项',
  '贷款',         '复工复产',     '贴现',     '普惠性',       '额度',
  '普惠',         '法制化',       '长期',     '动摇',         '形势',
  '变化',         '时',           '调整',     '阶段',         '处置',
  '点',           '优先序',       '稳健',     '货币政策',     '灵活',
  '适度',         '总量政策',     '融资成本', '实体经济',     '政策取',
  '前瞻性',       '利率',         '降准'
]
三、测试词库处理结果
// Strip stop words from each of the four test documents, in file order.
const [textTxt1, textTxt2, textTxt3, textTxt4] = [
  "./test1.txt",
  "./test2.txt",
  "./test3.txt",
  "./test4.txt",
].map((file) => stopWords(fs.readFileSync(file).toString()).content);

结果为

TextTxt1: 稳健货币政策灵活适度总量政策适度降低融资成本实体经济政策取跨周期降低存款准备金率长期流动性累计货币政策前瞻性利率降实现金融系统实体经济利人民币汇率合理均衡基本
TextTxt2: 金融稳企业保业取预期效果经济规律分层次有梯度金融政策抗疫保供专项贷款复工复产贷款贴现普惠性贷款贴现额度普惠微企业贷款延期本付息普惠微企业信贷款两项实体经济货币政策工具加精准扶贫制造业绿色产业 
信贷力度
TextTxt3: 宏观审慎政策框架进步健全持续构建要素完备宏观审慎政策框架逆周期资本缓机制宏观审慎压力测试完善系统性风险监测评估预警健全跨境资金流动宏观审慎管理框架统筹监管系统性金融机构金融控股公司取
TextTxt4: 金融风险攻坚战取阶段性成果市场化法治化动摇,形势变化时调整阶段金融风险处置点优先序
四、为训练词库添加状态标签
/**
 * Produce the hidden-state labels for every word of the training vocabulary.
 *
 * Labelling rule: a one-character word is ["S"]; a longer word starts with
 * "B", ends with "E", and has "M" for every character in between. An empty
 * word gets an empty label list so that labels and characters stay aligned.
 * @function genHiddenStates
 * @param {*} objects - list of words (observation sequences)
 * @returns {Array} one label array per word, in the same order as `objects`
 */
let genHiddenStates = (objects) => {
  const stateLists = [];
  for (let i = 0; i < objects.length; i++) {
    const size = objects[i].length;
    let state;
    if (size === 0) {
      // Nothing to label; the original fell through to ["B", "E"] here.
      state = [];
    } else if (size === 1) {
      state = ["S"];
    } else {
      state = ["B", ...new Array(size - 2).fill("M"), "E"];
    }
    stateLists.push(state);
  }
  return stateLists;
};

标注结果为

[
  [ 'B', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'E' ],[ 'B', 'M', 'M', 'M', 'E' ],
  [ 'B', 'M', 'E' ], [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'M', 'E' ],[ 'B', 'E' ],
  [ 'B', 'M', 'M', 'M', 'M', 'E' ], [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'M', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'M', 'M', 'M', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'M', 'M', 'M', 'E' ], [ 'B', 'M', 'E' ],
  [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'M', 'M', 'M', 'M', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'S' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'M', 'M', 'M', 'E' ], [ 'S' ], [ 'B', 'E' ],
  [ 'B', 'M', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'M', 'E' ],
  [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'S' ], [ 'B', 'E' ],
  [ 'B', 'E' ], [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'E' ],
  [ 'B', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'M', 'M', 'M', 'M', 'E' ], [ 'B', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ],
  [ 'B', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'S' ], 
  [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'S' ], [ 'B', 'M', 'E' ], [ 'B', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'E' ], [ 'B', 'E' ], [ 'B', 'M', 'M', 'E' ],
  [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'M', 'E' ], [ 'B', 'M', 'E' ], [ 'B', 'M', 'E' ],
  [ 'B', 'E' ], [ 'B', 'E' ]
]
五、生成初始概率向量
/**
 * Build the initial probability vector: for each state in STATES, the number
 * of times it appears among all training labels divided by the total number
 * of labels.
 * @function genInitProVector
 * @param {*} trainsSates - list of label arrays, one per training word
 * @returns {Object} map from state to its share of all labels
 */
let genInitProVector = (trainsSates) => {
  const tally = {};
  for (const state of STATES) {
    tally[state] = 0;
  }

  let total = 0;
  for (const labels of trainsSates) {
    for (const label of labels) {
      tally[label] += 1;
      total += 1;
    }
  }

  const shares = {};
  for (const state of STATES) {
    shares[state] = tally[state] / total;
  }
  return shares;
};

结果为

{
  B: 0.3333333333333333,
  E: 0.3333333333333333,
  M: 0.3132530120481928,
  S: 0.020080321285140562
}
六、生成状态转移概率矩阵
/**
 * Build the state transition probability matrix from the training labels.
 *
 * Every adjacent pair of labels inside a word counts as one transition; each
 * row is then normalised by the number of transitions leaving that state.
 * @function genTransProMatrix
 * @param {*} trainsSates - list of label arrays, one per training word
 * @returns {Object} nested map `matrix[origin][target]` of probabilities
 */
let genTransProMatrix = (trainsSates) => {
  const blankRow = () => {
    const row = {};
    for (const state of STATES) {
      row[state] = 0;
    }
    return row;
  };

  const pairCounts = {};
  const leaving = {};
  const matrix = {};
  for (const origin of STATES) {
    pairCounts[origin] = blankRow();
    matrix[origin] = blankRow();
    leaving[origin] = 0;
  }

  trainsSates.forEach((labels) => {
    for (let pos = 1; pos < labels.length; pos++) {
      const origin = labels[pos - 1];
      const target = labels[pos];
      pairCounts[origin][target] += 1;
      leaving[origin] += 1;
    }
  });

  for (const origin of STATES) {
    // A state with no outgoing transition in the training data is given a
    // self-transition with probability 1.
    if (leaving[origin] === 0) {
      pairCounts[origin][origin] = 1;
      leaving[origin] = 1;
    }
    for (const target of STATES) {
      matrix[origin][target] = pairCounts[origin][target] / leaving[origin];
    }
  }
  return matrix;
};

结果为

{
  B: { B: 0, E: 0.4819277108433735, M: 0.5180722891566265, S: 0 },
  E: { B: 0, E: 1, M: 0, S: 0 },
  M: { B: 0, E: 0.5512820512820513, M: 0.44871794871794873, S: 0 },
  S: { B: 0, E: 0, M: 0, S: 1 }
}
七、生成发射概率矩阵
/**
 * Build the emission probability matrix: for each state, how often each
 * training character carries that label, divided by how often the label
 * occurs at all.
 * @function genEmitProMatrix
 * @param {*} trainsSates - list of label arrays, one per training word
 * @param {*} trainsObjects - list of training words, aligned with trainsSates
 * @returns {Object} nested map `matrix[state][character]` of probabilities
 */
let genEmitProMatrix = (trainsSates, trainsObjects) => {
  // Every character of the training words, in order (duplicates included).
  const symbols = [];
  trainsObjects.forEach((word) => {
    for (let pos = 0; pos < word.length; pos++) {
      symbols.push(word[pos]);
    }
  });

  const hits = {};
  const labelTotals = {};
  const matrix = {};
  for (const state of STATES) {
    hits[state] = {};
    matrix[state] = {};
    labelTotals[state] = 0;
    for (const symbol of symbols) {
      hits[state][symbol] = 0;
      matrix[state][symbol] = 0;
    }
  }

  for (const [wordIdx, labels] of trainsSates.entries()) {
    for (const [charIdx, label] of labels.entries()) {
      hits[label][trainsObjects[wordIdx][charIdx]] += 1;
      labelTotals[label] += 1;
    }
  }

  for (const state of STATES) {
    for (const symbol of symbols) {
      matrix[state][symbol] = hits[state][symbol] / labelTotals[state];
    }
  }
  return matrix;
};

部分结果为

{B: {'成': 0.012048192771084338, '果': 0, '市': 0.012048192771084338},
 E: {'果': 0.024096385542168676, '市': 0, '场': 0, '化': 0.03614457831325301,},
 M: {'周': 0.02564102564102564, '期': 0.01282051282051282,},
 S: {'稳': 0.2}}
八、维特比算法分词
/**
 * Bubble-sort a list of `{ value }` records in place, largest value first.
 * Records with equal values keep their relative order.
 * @function Bubblesort
 * @param {*} disSet - list of records exposing a `value` property
 * @returns {*} the same list, now sorted in descending order of `value`
 */
let Bubblesort = (disSet) => {
  // After each pass the smallest remaining value has sunk to position
  // `last`, so the next pass can stop one slot earlier.
  for (let last = disSet.length - 1; last > 0; last--) {
    for (let pos = 0; pos < last; pos++) {
      if (disSet[pos].value < disSet[pos + 1].value) {
        [disSet[pos], disSet[pos + 1]] = [disSet[pos + 1], disSet[pos]];
      }
    }
  }
  return disSet;
};

/**
 * Pick the state whose probability is largest.
 * @function maxProbIndex
 * @param {*} options - map from state to probability
 * @returns {String} the key that Bubblesort ranks first
 */
let maxProbIndex = (options) => {
  const ranked = Object.keys(options).map((state) => ({
    state,
    value: options[state],
  }));
  // The ordering (including ties) is delegated to Bubblesort.
  return Bubblesort(ranked)[0].state;
};

/**
 * Label every character of `testObjects` with a hidden state.
 *
 * The decoding is greedy: the first label maximises initial probability times
 * emission probability, and each later label maximises the score reached from
 * the previously chosen label (previous score * emission * transition).
 * @function viterbi
 * @param {Object} initProVector - map from state to initial probability
 * @param {Object} transProMatrix - nested map `[origin][target]`
 * @param {Object} emitProMatrix - nested map `[state][character]`
 * @param {String} testObjects - text to label
 * @returns {Array} one state label per character; empty for empty text
 */
let viterbi = (initProVector, transProMatrix, emitProMatrix, testObjects) => {
  const testList = testObjects.split("");
  if (testList.length === 0) {
    return [];
  }
  const steps = [{}];
  STATES.forEach((state) => {
    steps[0][state] = initProVector[state] * emitProMatrix[state][testList[0]];
  });
  const path = [maxProbIndex(steps[0])]; // e.g. path = ["B"]
  // Walk the whole text; the bound used to be hard-coded to 4 characters.
  for (let i = 1; i < testList.length; i++) {
    const lastState = path[path.length - 1];
    steps.push({});
    STATES.forEach((thisState) => {
      steps[i][thisState] =
        steps[i - 1][lastState] *
        emitProMatrix[thisState][testList[i]] *
        transProMatrix[lastState][thisState];
    });
    path.push(maxProbIndex(steps[i]));
  }
  return path;
};