HMM实现中文分词
import numpy as np
import warnings
from hmmlearn.hmm import MultinomialHMM as mhmm
# Toy training corpus: every entry maps a phrase to a tag string of the same
# length, one tag per character, drawn from the alphabet B/M/E/S
# (word-begin, word-middle, word-end, single-character word).
data = [
    {"我要吃飯": "SSBE"},
    {"天氣不錯": "BEBE"},
    {"謝天謝地": "BMME"},
]
def prints(s):
    """Write *s* to standard output.

    Single choke point for the debug traces emitted by the parameter
    estimation functions.
    """
    print(s)
def get_startprob():
    """Estimate the initial-state distribution over the tags B, M, E, S.

    Counts the first tag of every tag string stored in the module-level
    ``data`` and divides each count by the number of tag strings.

    Returns:
        list[float]: four probabilities, ordered B, M, E, S.

    Raises:
        ZeroDivisionError: if ``data`` contains no tag strings.
        KeyError: if a tag string starts with a character outside "BMES".
    """
    sample_count = 0
    first_tag_counts = {"B": 0, "M": 0, "E": 0, "S": 0}
    for sample in data:
        for tags in sample.values():
            sample_count += 1
            first_tag = tags[0]
            prints("value[0] is " + first_tag)
            first_tag_counts[first_tag] += 1
            prints("c_map[value[0]] is " + str(first_tag_counts[first_tag]))
    return [first_tag_counts[state] / float(sample_count) for state in "BMES"]


def get_transmat():
    """Estimate the tag-to-tag transition matrix from ``data``.

    Every adjacent pair of tags inside a tag string counts as one
    transition; each row is then normalised by the number of transitions
    leaving that tag.

    Returns:
        list[list[float]]: a 4x4 matrix whose rows (source tag) and columns
        (destination tag) are both ordered B, M, E, S. Every row sums to 1.
    """
    states = "BMES"
    # Transition counts keyed by the two-character pair, e.g. {"BE": 3}.
    pair_counts = {}
    for sample in data:
        for tags in sample.values():
            prints("value[0] is " + tags[0])
            for src, dst in zip(tags, tags[1:]):
                pair = src + dst
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
    prints("get_transmat's c_map is " + str(pair_counts))

    matrix = []
    for src in states:
        row_counts = [pair_counts.get(src + dst, 0) for dst in states]
        row_total = sum(row_counts)
        if row_total:
            matrix.append([count / float(row_total) for count in row_counts])
        else:
            # A tag that never has a successor in the data (for example M in
            # a corpus without three-character words) would otherwise divide
            # by zero. A uniform row keeps the matrix row-stochastic.
            matrix.append([1.0 / len(states)] * len(states))
    return matrix
def get_words():
    """Return the observation vocabulary as a string of distinct characters.

    The position of a character in the returned string is the integer symbol
    used for it in the emission matrix and in encoded observation sequences,
    so each character must appear exactly once. The previous literal listed
    "天" twice, which produced an unused duplicate emission column.

    Returns:
        str: the ten distinct characters that occur in the training phrases.
    """
    return u"我要吃飯天氣不錯謝地"
def get_word_map():
    """Map each vocabulary character to its position in ``get_words()``.

    Returns:
        dict[str, int]: character -> index. When a character occurs more
        than once in the vocabulary string, its last position is kept.
    """
    return {char: index for index, char in enumerate(get_words())}
def get_array_from_phase(phase):
    """Encode a phrase as the list of vocabulary indices of its characters.

    Args:
        phase: the text to encode, one observation per character.

    Returns:
        list[int]: the index from ``get_word_map()`` for each character, in
        order.

    Raises:
        KeyError: if the phrase contains a character missing from the
            vocabulary.
    """
    index_of = get_word_map()
    return [index_of[char] for char in phase]
def get_emissionprob():
    """Estimate the emission matrix P(character | tag) from ``data``.

    Each (tag, character) pair seen in the training phrases is counted and
    every row is normalised by that tag's own total, so each row is a
    probability distribution over the vocabulary. The earlier version
    divided by the overall number of tagged characters, which yields the
    joint P(tag, character) and rows that do not sum to 1.

    Returns:
        list[list[float]]: one row per tag, ordered B, M, E, S, and one
        column per position of ``get_words()``.
    """
    states = "BMES"
    # Counts keyed by tag + character, e.g. {"B吃": 1}.
    pair_counts = {}
    for sample in data:
        for phrase, tags in sample.items():
            prints("value[0] is " + tags[0])
            for tag, char in zip(tags, phrase):
                pair = tag + char
                prints("emmition's couple is " + pair)
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
    prints("emmition's c_map is " + str(pair_counts))

    words = get_words()
    # Column actually used for each character: the last position wins, which
    # matches how get_word_map() resolves repeated characters. Any earlier
    # duplicate column is never observed, so it gets no probability mass.
    column_of = {char: index for index, char in enumerate(words)}

    matrix = []
    for state in states:
        row_counts = [
            pair_counts.get(state + char, 0) if column_of[char] == index else 0
            for index, char in enumerate(words)
        ]
        row_total = sum(row_counts)
        if row_total:
            row = [count / float(row_total) for count in row_counts]
        else:
            # Tag never emitted anything in the data: spread the mass evenly
            # over the live columns so the row is still a distribution.
            row = [
                1.0 / len(column_of) if column_of[char] == index else 0.0
                for index, char in enumerate(words)
            ]
        matrix.append(row)
    return matrix
if __name__ == "__main__":
    # Show the estimated emission table and the character -> index mapping.
    print("emissionprob is ", get_emissionprob())
    print("word map is ", get_word_map())

    warnings.filterwarnings("ignore")

    # Estimate the three HMM parameter sets from the training data.
    start_probabilities = np.array(get_startprob())
    print("startprob is ", start_probabilities)
    transition_matrix = np.array(get_transmat())
    print("transmat is ", transition_matrix)
    emission_matrix = np.array(get_emissionprob())
    print("emmissionprob is ", emission_matrix)

    # Build a 4-state model and set its parameters directly instead of
    # fitting them.
    # NOTE(review): newer hmmlearn releases appear to have moved the
    # categorical-emission model to CategoricalHMM - confirm against the
    # installed version.
    model = mhmm(n_components=4)
    model.startprob_ = start_probabilities
    model.transmat_ = transition_matrix
    model.emissionprob_ = emission_matrix

    # Encode the sentence as a column vector of vocabulary indices and decode
    # the most likely tag sequence.
    sentence = u"我要吃飯謝天謝地"
    observations = np.array(get_array_from_phase(sentence)).reshape(len(sentence), 1)
    print("X is ", observations)
    decoded_states = model.predict(observations)
    print("Y is ", decoded_states)
    # State indices: B (word begin) = 0, M (word middle) = 1,
    # E (word end) = 2, S (single-character word) = 3.
out
F:\anaconda\pythonw.exe D:/學習資料/網易云課堂/唐宇迪-機器學習課程(新)/自然語言處理(Python版)/第八章:HMM實戰/HMM案例實戰/HMM/get_hmm_param.py value[0] is S emmition's couple is S我 emmition's couple is S要 emmition's couple is B吃 emmition's couple is E飯 value[0] is B emmition's couple is B天 emmition's couple is E氣 emmition's couple is B不 emmition's couple is E錯 value[0] is B emmition's couple is B謝 emmition's couple is M天 emmition's couple is M謝 emmition's couple is E地 emmition's c_map is {'S我': 1, 'S要': 1, 'B吃': 1, 'E飯': 1, 'B天': 1, 'E氣': 1, 'B不': 1, 'E錯': 1, 'B謝': 1, 'M天': 1, 'M謝': 1, 'E地': 1} emissionprob is [[0.0, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.08333333333333333, 0.0], [0.0, 0.0, 0.0, 0.0, 0.08333333333333333, 0.0, 0.0, 0.0, 0.08333333333333333, 0.08333333333333333, 0.0], [0.0, 0.0, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.0, 0.08333333333333333], [0.08333333333333333, 0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] word map is {'我': 0, '要': 1, '吃': 2, '飯': 3, '天': 9, '氣': 5, '不': 6, '錯': 7, '謝': 8, '地': 10} value[0] is S c_map[value[0]] is 1 value[0] is B c_map[value[0]] is 1 value[0] is B c_map[value[0]] is 2 startprob is [0.66666667 0. 0. 0.33333333] value[0] is S value[0] is B value[0] is B get_transmat's c_map is {'SS': 1, 'SB': 1, 'BE': 3, 'EB': 1, 'BM': 1, 'MM': 1, 'ME': 1} transmat is [[0. 0.25 0.75 0. ][0. 0.5 0.5 0. ][1. 0. 0. 0. ][0.5 0. 0. 0.5 ]] value[0] is S emmition's couple is S我 emmition's couple is S要 emmition's couple is B吃 emmition's couple is E飯 value[0] is B emmition's couple is B天 emmition's couple is E氣 emmition's couple is B不 emmition's couple is E錯 value[0] is B emmition's couple is B謝 emmition's couple is M天 emmition's couple is M謝 emmition's couple is E地 emmition's c_map is {'S我': 1, 'S要': 1, 'B吃': 1, 'E飯': 1, 'B天': 1, 'E氣': 1, 'B不': 1, 'E錯': 1, 'B謝': 1, 'M天': 1, 'M謝': 1, 'E地': 1} emmissionprob is [[0. 0. 0.08333333 0. 
0.08333333 0. 0.08333333 0. 0.08333333 0.08333333 0. ] [0. 0. 0. 0. 0.08333333 0. 0. 0. 0.08333333 0.08333333 0. ] [0. 0. 0. 0.08333333 0. 0.08333333 0. 0.08333333 0. 0. 0.08333333] [0.08333333 0.08333333 0. 0. 0. 0. 0. 0. 0. 0. 0. ]] X is [[ 0] [ 1] [ 2] [ 3] [ 8] [ 9] [ 8] [10]] Y is [3 3 0 2 0 1 1 2]
Process finished with exit code 0
總結