Files
MoFin/venv/lib/python3.12/site-packages/jieba/posseg/__init__.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

244 lines
7.3 KiB
Python

import re
import os
from . import viterbi
import jieba
import sys
import marshal
from functools import wraps
# Filesystem encoding reported by the running interpreter.
default_encoding = sys.getfilesystemencoding()
# File names of the marshalled tables that load_model() opens, relative to
# this module's directory, when it is called with isJython=True.
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"
def load_model(f_name, isJython=True):
    """Load the word -> tag table and, optionally, the marshalled model tables.

    Each non-blank line of *f_name* is decoded as UTF-8 and must consist of
    exactly three single-space-separated fields; the first field is used as
    the key and the third as the value (the middle field is ignored).

    Args:
        f_name: Path of the dictionary file to read.
        isJython: When false, only the word -> tag dict is returned. When
            true, the four ``*.p`` files next to this module are additionally
            loaded with ``marshal``.

    Returns:
        ``result`` (dict) when *isJython* is false, otherwise the tuple
        ``(state, start_p, trans_p, emit_p, result)``.

    Raises:
        ValueError: If a non-blank line does not split into three fields.
    """
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    result = {}
    with open(f_name, "rb") as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            word, _, tag = raw_line.decode("utf-8").split(" ")
            result[word] = tag

    if not isJython:
        return result

    def _load_marshalled(file_name):
        # The with-block closes the file; no explicit bookkeeping is needed.
        with open(os.path.join(_curpath, file_name), 'rb') as fp:
            return marshal.load(fp)

    # Same load order as before: start, trans, emit, then the state table.
    start_p = _load_marshalled(PROB_START_P)
    trans_p = _load_marshalled(PROB_TRANS_P)
    emit_p = _load_marshalled(PROB_EMIT_P)
    state = _load_marshalled(CHAR_STATE_TAB_P)

    return state, start_p, trans_p, emit_p, result
# Bind the model tables once, at import time. They are the tables handed to
# viterbi.viterbi() and the word -> tag lookup used by the cutters below.
if not sys.platform.startswith("java"):
    # The tables are available as sibling modules exposing a ``P`` attribute;
    # only the dictionary file has to be parsed.
    from . import char_state_tab, prob_start, prob_trans, prob_emit
    char_state_tab_P = char_state_tab.P
    start_P = prob_start.P
    trans_P = prob_trans.P
    emit_P = prob_emit.P
    word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
else:
    # sys.platform starts with "java": let load_model() read every table
    # (its isJython=True default) from the marshalled files.
    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
def makesure_userdict_loaded(fn):
    """Decorate *fn* so pending user tags are merged before every call.

    If ``jieba.user_word_tag_tab`` is non-empty when the wrapper is called,
    its entries are copied into ``word_tag_tab`` and the attribute is reset
    to an empty dict; *fn* is then called with the original arguments and its
    result returned.
    """
    def _sync_then_call(*args, **kwargs):
        pending = jieba.user_word_tag_tab
        if pending:
            word_tag_tab.update(pending)
            jieba.user_word_tag_tab = {}
        return fn(*args, **kwargs)

    return wraps(fn)(_sync_then_call)
class pair(object):
    """A word together with its flag, rendered as ``word/flag``."""

    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        """Return the ``word/flag`` text form."""
        return '%s/%s' % (self.word, self.flag)

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        # __str__ must return str on Python 3. The previous implementation
        # returned the encoded bytes, so str(), repr() and print() on a pair
        # raised TypeError. Callers that want bytes can use encode().
        return self.__unicode__()

    def encode(self, arg):
        """Return the ``word/flag`` text encoded with the codec named *arg*."""
        return self.__unicode__().encode(arg)
def __cut(sentence):
    """Yield ``pair`` objects for *sentence* from the viterbi state sequence.

    ``viterbi.viterbi`` returns a second value that is indexed per character;
    item ``[0]`` of each entry is compared against 'B', 'E' and 'S' and item
    ``[1]`` is used as the flag. 'B' records where a word starts, 'E' emits
    the slice from that start through the current character, 'S' emits the
    current character alone. Any other state emits nothing. Text left after
    the last emitted word is yielded as one trailing pair.
    """
    _, tagged = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
    word_start = 0
    consumed = 0

    for idx, ch in enumerate(sentence):
        entry = tagged[idx]
        state = entry[0]
        if state == 'B':
            word_start = idx
            continue
        if state == 'E':
            token = sentence[word_start:idx + 1]
        elif state == 'S':
            token = ch
        else:
            continue
        yield pair(token, entry[1])
        consumed = idx + 1

    if consumed < len(sentence):
        # Flush whatever follows the last completed word.
        yield pair(sentence[consumed:], tagged[consumed][1])
def __cut_detail(sentence):
    """Yield ``pair`` objects for *sentence*, block by block.

    The sentence is split into runs of characters in U+4E00..U+9FA5 and the
    text between them. Each such run is delegated to ``__cut``. The remaining
    text is split into digit/dot runs and ASCII alphanumeric runs; every
    non-empty piece is flagged 'm' if it starts with a digit or dot, 'eng' if
    it starts with an ASCII letter or digit, and 'x' otherwise.
    """
    re_han = re.compile("([\u4E00-\u9FA5]+)")
    # Raw strings: "\." inside a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12). The compiled patterns are unchanged.
    re_skip = re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
    re_eng = re.compile(r"[a-zA-Z0-9]+")
    re_num = re.compile(r"[\.0-9]+")

    for blk in re_han.split(sentence):
        if re_han.match(blk):
            yield from __cut(blk)
            continue
        for piece in re_skip.split(blk):
            if not piece:
                # split() produces empty strings around adjacent matches.
                continue
            if re_num.match(piece):
                yield pair(piece, 'm')
            elif re_eng.match(piece):
                yield pair(piece, 'eng')
            else:
                yield pair(piece, 'x')
def __cut_DAG_NO_HMM(sentence):
    """Yield ``pair`` objects by walking the route filled in by ``jieba.calc``.

    ``route[i][1]`` is used as the index of the last character of the segment
    starting at ``i``. Consecutive one-character segments matching
    ``[a-zA-Z0-9]`` are accumulated and emitted together with flag 'eng';
    every other segment is emitted with its flag from ``word_tag_tab``
    ('x' when absent).
    """
    dag = jieba.get_DAG(sentence)
    best_route = {}
    jieba.calc(sentence, dag, 0, route=best_route)

    pos = 0
    total = len(sentence)
    pending = ''
    alnum_re = re.compile('[a-zA-Z0-9]',re.U)

    while pos < total:
        stop = best_route[pos][1] + 1
        segment = sentence[pos:stop]
        pos = stop
        if alnum_re.match(segment) and len(segment) == 1:
            pending += segment
            continue
        if pending:
            # A non-alphanumeric segment ends the accumulated run.
            yield pair(pending, 'eng')
            pending = ''
        yield pair(segment, word_tag_tab.get(segment, 'x'))

    if pending:
        yield pair(pending, 'eng')
def __flush_buf(buf):
    """Yield pairs for a non-empty run of buffered one-character segments.

    A single character is looked up in ``word_tag_tab``. A longer run that is
    not a key of ``jieba.FREQ`` is handed to ``__cut_detail``; otherwise each
    character is emitted on its own with its ``word_tag_tab`` flag.
    """
    if len(buf) == 1:
        yield pair(buf, word_tag_tab.get(buf, 'x'))
    elif buf not in jieba.FREQ:
        yield from __cut_detail(buf)
    else:
        for elem in buf:
            yield pair(elem, word_tag_tab.get(elem, 'x'))


def __cut_DAG(sentence):
    """Yield ``pair`` objects by walking the route filled in by ``jieba.calc``.

    ``route[i][1]`` is used as the index of the last character of the segment
    starting at ``i``. One-character segments are buffered; when a longer
    segment (or the end of the sentence) is reached the buffer is flushed via
    ``__flush_buf``. Longer segments are emitted with their flag from
    ``word_tag_tab`` ('x' when absent).
    """
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)

    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                yield from __flush_buf(buf)
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if buf:
        # Trailing one-character segments that were never followed by a word.
        yield from __flush_buf(buf)
def __cut_internal(sentence, HMM=True):
    """Yield ``pair`` objects for *sentence*.

    Non-``str`` input is decoded as UTF-8, falling back to GBK with
    undecodable bytes ignored. The text is split into runs of characters from
    the ``re_han`` class (U+4E00..U+9FA5, ASCII letters and digits, and
    ``+#&._``) and the text between them. Each run goes to ``__cut_DAG`` when
    *HMM* is true, otherwise to ``__cut_DAG_NO_HMM``. The text in between is
    split on whitespace: each whitespace token is emitted with flag 'x' and
    every other character is emitted individually with flag 'm', 'eng' or 'x'.
    """
    if not isinstance(sentence, str):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')

    # Raw strings: "\." and "\s" inside normal literals are invalid escape
    # sequences (SyntaxWarning on Python 3.12). The re module interprets the
    # \uXXXX, \r and \n escapes itself, so the patterns match the same text.
    re_han = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)")
    re_skip = re.compile(r"(\r\n|\s)")
    re_eng = re.compile(r"[a-zA-Z0-9]+")
    re_num = re.compile(r"[\.0-9]+")

    cut_block = __cut_DAG if HMM else __cut_DAG_NO_HMM

    for blk in re_han.split(sentence):
        if re_han.match(blk):
            yield from cut_block(blk)
            continue
        for piece in re_skip.split(blk):
            if re_skip.match(piece):
                yield pair(piece, 'x')
                continue
            for ch in piece:
                if re_num.match(ch):
                    yield pair(ch, 'm')
                elif re_eng.match(ch):
                    # Test the character being emitted; the previous code
                    # tested the whole fragment here.
                    yield pair(ch, 'eng')
                else:
                    yield pair(ch, 'x')
def __lcut_internal(sentence):
    """Return every pair from ``__cut_internal(sentence)`` (HMM on) as a list."""
    tokens = __cut_internal(sentence, HMM=True)
    return list(tokens)
def __lcut_internal_no_hmm(sentence):
    """Return every pair from ``__cut_internal(sentence, False)`` as a list."""
    tokens = __cut_internal(sentence, HMM=False)
    return list(tokens)
@makesure_userdict_loaded
def cut(sentence, HMM=True):
    """Yield ``pair`` objects for *sentence*.

    When ``jieba`` has no ``pool`` attribute, or it is ``None``, the work is
    done in-process by ``__cut_internal``. Otherwise the sentence is split on
    runs of CR/LF characters (the separators are kept as their own parts),
    the parts are mapped through ``jieba.pool.map`` and the per-part results
    are yielded in order.

    Args:
        sentence: Text to cut.
        HMM: Selects ``__lcut_internal`` / ``__lcut_internal_no_hmm`` for the
            pool path and is forwarded to ``__cut_internal`` otherwise.
    """
    if getattr(jieba, 'pool', None) is None:
        for token in __cut_internal(sentence, HMM=HMM):
            yield token
        return

    line_splitter = re.compile('([\r\n]+)')
    chunks = line_splitter.split(sentence)
    worker = __lcut_internal if HMM else __lcut_internal_no_hmm
    for chunk_tokens in jieba.pool.map(worker, chunks):
        for token in chunk_tokens:
            yield token