這篇文章主要是用 Python 實現(xiàn)了簡單的中文分詞的同音字糾錯,目前的案例中只允許錯一個字,自己如果有興趣可以繼續(xù)優(yōu)化下去。具體實現(xiàn)代碼如下所示:
import re,pinyin
from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag
class corrector():
    """Correct a Chinese phrase in which exactly one character is wrong,
    by substituting homophone candidates for each position and keeping
    the first substitution found in the vocabulary Trie.

    NOTE(review): getCorrection relies on a module-level Trie instance
    named ``Tree`` (created in the ``__main__`` section) — confirm
    initialization order if this class is reused elsewhere.
    """

    def __init__(self):
        # Matches exactly one CJK unified ideograph.
        self.re_compile = re.compile(r'[\u4e00-\u9fff]')
        # Shared DAG parameters for the Pinyin2Hanzi converter.
        self.DAG = DefaultDagParams()

    def getData(self):
        """Read the token file and return words longer than 2 chars
        that consist entirely of Chinese characters.

        Returns:
            list[str]: the filtered vocabulary.
        """
        words = []
        with open("/Users/wys/Desktop/token.txt") as f:
            # Iterate the file lazily instead of readlines() — same
            # behavior, no full-file list in memory.
            for line in f:
                word = line.split(" ")[0]
                if word and len(word) > 2:
                    res = self.re_compile.findall(word)
                    # Keep the token only if every char is a Hanzi.
                    if len(res) == len(word):
                        words.append(word)
        return words

    def pinyin_2_hanzi(self, pinyinList):
        """Convert a list of pinyin syllables into up to 10 candidate
        Hanzi (one character per candidate path).

        Args:
            pinyinList: list of pinyin strings, e.g. ['jie'].
        Returns:
            list[str]: candidate characters, best-ranked first.
        """
        result = []
        words = dag(self.DAG, pinyinList, path_num=10)
        for item in words:
            # item.path is the converted character sequence; take its head.
            result.append(item.path[0])
        return result

    def getCandidates(self, phrase):
        """Generate every phrase obtained by swapping exactly one
        character of *phrase* for one of its homophones.

        Bug fix: the original used ``phrase.replace(c, x)``, which
        replaces ALL occurrences of a repeated character at once.
        Replacing by position keeps the one-wrong-character model.

        Returns:
            set[str]: candidate phrases (may include *phrase* itself).
        """
        replaces = set()
        for i, ch in enumerate(phrase):
            syllables = pinyin.get(ch, format='strip', delimiter=',').split(',')
            for cand in self.pinyin_2_hanzi(syllables):
                # Substitute only position i.
                replaces.add(phrase[:i] + cand + phrase[i + 1:])
        return replaces

    def getCorrection(self, words):
        """For each input phrase, append the first homophone candidate
        found in the global Trie ``Tree``.

        Bug fix: the original reused the loop variable name ``word`` for
        the inner loop, shadowing the phrase being corrected.

        Args:
            words: iterable of phrases to correct.
        Returns:
            list[str]: accepted corrections (phrases with no accepted
            candidate contribute nothing).
        """
        result = []
        for word in words:
            for candidate in self.getCandidates(word):
                if Tree.search(candidate):
                    result.append(candidate)
                    break
        return result
class Node:
    """A single trie node: child links plus an end-of-word flag."""

    def __init__(self):
        # True when the path from the root to this node spells a
        # complete inserted word.
        self.word = False
        # Maps one character to the next Node on the path.
        self.child = {}


class Trie(object):
    """Prefix tree over whole words; ``search`` matches exact words only."""

    def __init__(self):
        self.root = Node()

    def insert(self, words):
        """Insert every word of *words* into the tree."""
        for entry in words:
            node = self.root
            for ch in entry:
                # Walk down, creating missing links on the way.
                node = node.child.setdefault(ch, Node())
            node.word = True

    def search(self, word):
        """Return True iff *word* was inserted as a complete word."""
        node = self.root
        for ch in word:
            node = node.child.get(ch)
            if node is None:
                return False
        return node.word
if __name__ == '__main__':
    # Initialize the corrector (compiles the Hanzi regex and DAG params).
    c = corrector()
    # Load the vocabulary from the token file.
    words = c.getData()
    # Build the prefix tree; note getCorrection reads this module-level
    # name, so Tree must be created before calling it.
    Tree = Trie()
    # Insert every vocabulary word into the prefix tree.
    Tree.insert(words)
    # Demo: each test phrase has one wrong (homophone) character.
    print(c.getCorrection(['專塘街道','轉(zhuǎn)塘姐道','轉(zhuǎn)塘街到']))
到此這篇關(guān)于Python中文糾錯的簡單實現(xiàn)的文章就介紹到這了,更多相關(guān)Python中文糾錯內(nèi)容請搜索腳本之家以前的文章或繼續(xù)瀏覽下面的相關(guān)文章,希望大家以后多多支持腳本之家!