Files
MoFin/venv/lib/python3.12/site-packages/tinysegmenter.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

192 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# TinySegmenter -- Super compact Japanese tokenizer in Javascript
# (c) 2008 Taku Kudo <taku@chasen.org>
# "TinySegmenter in Python" is originally written by:
# (c) 2010 Masato Hagiwara
# See http://lilyx.net/pages/tinysegmenterp.html
# Maintained by:
# (c) 2012 Jehan
# TinySegmenter is freely distributable under the terms of a new BSD licence.
# See the COPYING file included with the distribution.
import re
class TinySegmenter():
def __init__(self):
self._patterns = {
# Ideogram Numbers.
u"[一二三四五六七八九十百千万億兆]":u"M",
# All other unified CJC ideograms.
u"[一-龠々〆ヵヶ]":u"H",
# All hiragana, small and big.
u"[ぁ-ん]":u"I",
# All katakana, except punctuation (U+30A0) and halfwidth katakana variants.
u"[ァ-ヺーヲ-゙]":u"K",
# ASCII letters (normal and fullwidth) and Latin1 supplement's letters, then extended Latin A.
u"[a-zA-Z-zA-ZÀ-ÖØ-öø-ÿĀ-ſ]":u"A",
# Normal and Fullwidth Arabic numbers (digits).
u"[0-9-]":u"N"
}
self._chartype = []
for pat in self._patterns:
regexp = re.compile(pat)
self._chartype.append([regexp, self._patterns[pat]])
self._BIAS = -332
self._BC1 = {u"HH":6,u"II":2461,u"KH":406,u"OH":-1378}
self._BC2 = {u"AA":-3267,u"AI":2744,u"AN":-878,u"HH":-4070,u"HM":-1711,u"HN":4012,u"HO":3761,u"IA":1327,u"IH":-1184,u"II":-1332,u"IK":1721,u"IO":5492,u"KI":3831,u"KK":-8741,u"MH":-3132,u"MK":3334,u"OO":-2920}
self._BC3 = {u"HH":996,u"HI":626,u"HK":-721,u"HN":-1307,u"HO":-836,u"IH":-301,u"KK":2762,u"MK":1079,u"MM":4034,u"OA":-1652,u"OH":266}
self._BP1 = {u"BB":295,u"OB":304,u"OO":-125,u"UB":352}
self._BP2 = {u"BO":60,u"OO":-1762}
self._BQ1 = {u"BHH":1150,u"BHM":1521,u"BII":-1158,u"BIM":886,u"BMH":1208,u"BNH":449,u"BOH":-91,u"BOO":-2597,u"OHI":451,u"OIH":-296,u"OKA":1851,u"OKH":-1020,u"OKK":904,u"OOO":2965}
self._BQ2 = {u"BHH":118,u"BHI":-1159,u"BHM":466,u"BIH":-919,u"BKK":-1720,u"BKO":864,u"OHH":-1139,u"OHM":-181,u"OIH":153,u"UHI":-1146}
self._BQ3 = {u"BHH":-792,u"BHI":2664,u"BII":-299,u"BKI":419,u"BMH":937,u"BMM":8335,u"BNN":998,u"BOH":775,u"OHH":2174,u"OHM":439,u"OII":280,u"OKH":1798,u"OKI":-793,u"OKO":-2242,u"OMH":-2402,u"OOO":11699}
self._BQ4 = {u"BHH":-3895,u"BIH":3761,u"BII":-4654,u"BIK":1348,u"BKK":-1806,u"BMI":-3385,u"BOO":-12396,u"OAH":926,u"OHH":266,u"OHK":-2036,u"ONN":-973}
self._BW1 = {u",と":660,u",同":727,u"B1あ":1404,u"B1同":542,u"、と":660,u"、同":727,u"」と":1682,u"あっ":1505,u"いう":1743,u"いっ":-2055,u"いる":672,u"うし":-4817,u"うん":665,u"から":3472,u"がら":600,u"こう":-790,u"こと":2083,u"こん":-1262,u"さら":-4143,u"さん":4573,u"した":2641,u"して":1104,u"すで":-3399,u"そこ":1977,u"それ":-871,u"たち":1122,u"ため":601,u"った":3463,u"つい":-802,u"てい":805,u"てき":1249,u"でき":1127,u"です":3445,u"では":844,u"とい":-4915,u"とみ":1922,u"どこ":3887,u"ない":5713,u"なっ":3015,u"など":7379,u"なん":-1113,u"にし":2468,u"には":1498,u"にも":1671,u"に対":-912,u"の一":-501,u"の中":741,u"ませ":2448,u"まで":1711,u"まま":2600,u"まる":-2155,u"やむ":-1947,u"よっ":-2565,u"れた":2369,u"れで":-913,u"をし":1860,u"を見":731,u"亡く":-1886,u"京都":2558,u"取り":-2784,u"大き":-2604,u"大阪":1497,u"平方":-2314,u"引き":-1336,u"日本":-195,u"本当":-2423,u"毎日":-2113,u"目指":-724,u"B1あ":1404,u"B1同":542,u"」と":1682}
self._BW2 = {u"..":-11822,u"11":-669,u"――":-5730,u"−−":-13175,u"いう":-1609,u"うか":2490,u"かし":-1350,u"かも":-602,u"から":-7194,u"かれ":4612,u"がい":853,u"がら":-3198,u"きた":1941,u"くな":-1597,u"こと":-8392,u"この":-4193,u"させ":4533,u"され":13168,u"さん":-3977,u"しい":-1819,u"しか":-545,u"した":5078,u"して":972,u"しな":939,u"その":-3744,u"たい":-1253,u"たた":-662,u"ただ":-3857,u"たち":-786,u"たと":1224,u"たは":-939,u"った":4589,u"って":1647,u"っと":-2094,u"てい":6144,u"てき":3640,u"てく":2551,u"ては":-3110,u"ても":-3065,u"でい":2666,u"でき":-1528,u"でし":-3828,u"です":-4761,u"でも":-4203,u"とい":1890,u"とこ":-1746,u"とと":-2279,u"との":720,u"とみ":5168,u"とも":-3941,u"ない":-2488,u"なが":-1313,u"など":-6509,u"なの":2614,u"なん":3099,u"にお":-1615,u"にし":2748,u"にな":2454,u"によ":-7236,u"に対":-14943,u"に従":-4688,u"に関":-11388,u"のか":2093,u"ので":-7059,u"のに":-6041,u"のの":-6125,u"はい":1073,u"はが":-1033,u"はず":-2532,u"ばれ":1813,u"まし":-1316,u"まで":-6621,u"まれ":5409,u"めて":-3153,u"もい":2230,u"もの":-10713,u"らか":-944,u"らし":-1611,u"らに":-1897,u"りし":651,u"りま":1620,u"れた":4270,u"れて":849,u"れば":4114,u"ろう":6067,u"われ":7901,u"を通":-11877,u"んだ":728,u"んな":-4115,u"一人":602,u"一方":-1375,u"一日":970,u"一部":-1051,u"上が":-4479,u"会社":-1116,u"出て":2163,u"分の":-7758,u"同党":970,u"同日":-913,u"大阪":-2471,u"委員":-1250,u"少な":-1050,u"年度":-8669,u"年間":-1626,u"府県":-2363,u"手権":-1982,u"新聞":-4066,u"日新":-722,u"日本":-7068,u"日米":3372,u"曜日":-601,u"朝鮮":-2355,u"本人":-2697,u"東京":-1543,u"然と":-1384,u"社会":-1276,u"立て":-990,u"第に":-1612,u"米国":-4268,u"11":-669};
self._BW3 = {u"あた":-2194,u"あり":719,u"ある":3846,u"い.":-1185,u"い。":-1185,u"いい":5308,u"いえ":2079,u"いく":3029,u"いた":2056,u"いっ":1883,u"いる":5600,u"いわ":1527,u"うち":1117,u"うと":4798,u"えと":1454,u"か.":2857,u"か。":2857,u"かけ":-743,u"かっ":-4098,u"かに":-669,u"から":6520,u"かり":-2670,u"が,u":1816,u"が、":1816,u"がき":-4855,u"がけ":-1127,u"がっ":-913,u"がら":-4977,u"がり":-2064,u"きた":1645,u"けど":1374,u"こと":7397,u"この":1542,u"ころ":-2757,u"さい":-714,u"さを":976,u"し,u":1557,u"し、":1557,u"しい":-3714,u"した":3562,u"して":1449,u"しな":2608,u"しま":1200,u"す.":-1310,u"す。":-1310,u"する":6521,u"ず,u":3426,u"ず、":3426,u"ずに":841,u"そう":428,u"た.":8875,u"た。":8875,u"たい":-594,u"たの":812,u"たり":-1183,u"たる":-853,u"だ.":4098,u"だ。":4098,u"だっ":1004,u"った":-4748,u"って":300,u"てい":6240,u"てお":855,u"ても":302,u"です":1437,u"でに":-1482,u"では":2295,u"とう":-1387,u"とし":2266,u"との":541,u"とも":-3543,u"どう":4664,u"ない":1796,u"なく":-903,u"など":2135,u"に,u":-1021,u"に、":-1021,u"にし":1771,u"にな":1906,u"には":2644,u"の,u":-724,u"の、":-724,u"の子":-1000,u"は,u":1337,u"は、":1337,u"べき":2181,u"まし":1113,u"ます":6943,u"まっ":-1549,u"まで":6154,u"まれ":-793,u"らし":1479,u"られ":6820,u"るる":3818,u"れ,u":854,u"れ、":854,u"れた":1850,u"れて":1375,u"れば":-3246,u"れる":1091,u"われ":-605,u"んだ":606,u"んで":798,u"カ月":990,u"会議":860,u"入り":1232,u"大会":2217,u"始め":1681,u"":965,u"新聞":-5055,u"日,u":974,u"日、":974,u"社会":2024,u"カ月":990};
self._TC1 = {u"AAA":1093,u"HHH":1029,u"HHM":580,u"HII":998,u"HOH":-390,u"HOM":-331,u"IHI":1169,u"IOH":-142,u"IOI":-1015,u"IOM":467,u"MMH":187,u"OOI":-1832};
self._TC2 = {u"HHO":2088,u"HII":-1023,u"HMM":-1154,u"IHI":-1965,u"KKH":703,u"OII":-2649};
self._TC3 = {u"AAA":-294,u"HHH":346,u"HHI":-341,u"HII":-1088,u"HIK":731,u"HOH":-1486,u"IHH":128,u"IHI":-3041,u"IHO":-1935,u"IIH":-825,u"IIM":-1035,u"IOI":-542,u"KHH":-1216,u"KKA":491,u"KKH":-1217,u"KOK":-1009,u"MHH":-2694,u"MHM":-457,u"MHO":123,u"MMH":-471,u"NNH":-1689,u"NNO":662,u"OHO":-3393};
self._TC4 = {u"HHH":-203,u"HHI":1344,u"HHK":365,u"HHM":-122,u"HHN":182,u"HHO":669,u"HIH":804,u"HII":679,u"HOH":446,u"IHH":695,u"IHO":-2324,u"IIH":321,u"III":1497,u"IIO":656,u"IOO":54,u"KAK":4845,u"KKA":3386,u"KKK":3065,u"MHH":-405,u"MHI":201,u"MMH":-241,u"MMM":661,u"MOM":841};
self._TQ1 = {u"BHHH":-227,u"BHHI":316,u"BHIH":-132,u"BIHH":60,u"BIII":1595,u"BNHH":-744,u"BOHH":225,u"BOOO":-908,u"OAKK":482,u"OHHH":281,u"OHIH":249,u"OIHI":200,u"OIIH":-68};
self._TQ2 = {u"BIHH":-1401,u"BIII":-1033,u"BKAK":-543,u"BOOO":-5591};
self._TQ3 = {u"BHHH":478,u"BHHM":-1073,u"BHIH":222,u"BHII":-504,u"BIIH":-116,u"BIII":-105,u"BMHI":-863,u"BMHM":-464,u"BOMH":620,u"OHHH":346,u"OHHI":1729,u"OHII":997,u"OHMH":481,u"OIHH":623,u"OIIH":1344,u"OKAK":2792,u"OKHH":587,u"OKKA":679,u"OOHH":110,u"OOII":-685};
self._TQ4 = {u"BHHH":-721,u"BHHM":-3604,u"BHII":-966,u"BIIH":-607,u"BIII":-2181,u"OAAA":-2763,u"OAKK":180,u"OHHH":-294,u"OHHI":2446,u"OHHO":480,u"OHIH":-1573,u"OIHH":1935,u"OIHI":-493,u"OIIH":626,u"OIII":-4007,u"OKAK":-8156};
self._TW1 = {u"につい":-4681,u"東京都":2026};
self._TW2 = {u"ある程":-2049,u"いった":-1256,u"ころが":-2434,u"しょう":3873,u"その後":-4430,u"だって":-1049,u"ていた":1833,u"として":-4657,u"ともに":-4517,u"もので":1882,u"一気に":-792,u"初めて":-1512,u"同時に":-8097,u"大きな":-1255,u"対して":-2721,u"社会党":-3216};
self._TW3 = {u"いただ":-1734,u"してい":1314,u"として":-4314,u"につい":-5483,u"にとっ":-5989,u"に当た":-6247,u"ので,u":-727,u"ので、":-727,u"のもの":-600,u"れから":-3752,u"十二月":-2287};
self._TW4 = {u"いう.":8576,u"いう。":8576,u"からな":-2348,u"してい":2958,u"たが,u":1516,u"たが、":1516,u"ている":1538,u"という":1349,u"ました":5543,u"ません":1097,u"ようと":-4258,u"よると":5865};
self._UC1 = {u"A":484,u"K":93,u"M":645,u"O":-505};
self._UC2 = {u"A":819,u"H":1059,u"I":409,u"M":3987,u"N":5775,u"O":646};
self._UC3 = {u"A":-1370,u"I":2311};
self._UC4 = {u"A":-2643,u"H":1809,u"I":-1032,u"K":-3450,u"M":3565,u"N":3876,u"O":6646};
self._UC5 = {u"H":313,u"I":-1238,u"K":-799,u"M":539,u"O":-831};
self._UC6 = {u"H":-506,u"I":-253,u"K":87,u"M":247,u"O":-387};
self._UP1 = {u"O":-214};
self._UP2 = {u"B":69,u"O":935};
self._UP3 = {u"B":189};
self._UQ1 = {u"BH":21,u"BI":-12,u"BK":-99,u"BN":142,u"BO":-56,u"OH":-95,u"OI":477,u"OK":410,u"OO":-2422};
self._UQ2 = {u"BH":216,u"BI":113,u"OK":1759};
self._UQ3 = {u"BA":-479,u"BH":42,u"BI":1913,u"BK":-7198,u"BM":3160,u"BN":6427,u"BO":14761,u"OI":-827,u"ON":-3212};
self._UW1 = {u",u":156,u"":156,u"":-463,u"":-941,u"":-127,u"":-553,u"":121,u"":505,u"":-201,u"":-547,u"":-123,u"":-789,u"":-185,u"":-847,u"":-466,u"":-470,u"":182,u"":-292,u"":208,u"":169,u"":-446,u"":-137,u"":-135,u"":-402,u"":-268,u"":-912,u"":871,u"":-460,u"":561,u"":729,u"":-411,u"":-141,u"":361,u"":-408,u"":-386,u"":-718,u"":-463,u"":-135};
self._UW2 = {u",u":-829,u"":-829,u"":892,u"":-645,u"":3145,u"":-538,u"":505,u"":134,u"":-502,u"":1454,u"":-856,u"":-412,u"":1141,u"":878,u"":540,u"":1529,u"":-675,u"":300,u"":-1011,u"":188,u"":1837,u"":-949,u"":-291,u"":-268,u"":-981,u"":1273,u"":1063,u"":-1764,u"":130,u"":-409,u"":-1273,u"":1261,u"":600,u"":-1263,u"":-402,u"":1639,u"":-579,u"":-694,u"":571,u"":-2516,u"":2095,u"":-587,u"":306,u"":568,u"":831,u"":-758,u"":-2150,u"":-302,u"":-968,u"":-861,u"":492,u"":-123,u"":978,u"":362,u"":548,u"":-3025,u"":-1566,u"":-3414,u"":-422,u"":-1769,u"":-865,u"":-483,u"":-1519,u"":760,u"":1023,u"":-2009,u"":-813,u"":-1060,u"":1067,u"":-1519,u"":-1033,u"":1522,u"":-1355,u"":-1682,u"":-1815,u"":-1462,u"":-630,u"":-1843,u"":-1650,u"":-931,u"":-665,u"":-2378,u"":-180,u"":-1740,u"":752,u"":529,u"":-1584,u"":-242,u"":-1165,u"":-763,u"":810,u"":509,u"":-1353,u"":838,u"西":-744,u"":-3874,u"調":1010,u"":1198,u"":3041,u"":1758,u"":-1257,u"":-645,u"":3145,u"":831,u"":-587,u"":306,u"":568};
self._UW3 = {u",u":4889,u"1":-800,u"":-1723,u"":4889,u"":-2311,u"":5827,u"":2670,u"":-3573,u"":-2696,u"":1006,u"":2342,u"":1983,u"":-4864,u"":-1163,u"":3271,u"":1004,u"":388,u"":401,u"":-3552,u"":-3116,u"":-1058,u"":-395,u"":584,u"":3685,u"":-5228,u"":842,u"":-521,u"":-1444,u"":-1081,u"":6167,u"":2318,u"":1691,u"":-899,u"":-2788,u"":2745,u"":4056,u"":4555,u"":-2171,u"":-1798,u"":1199,u"":-5516,u"":-4384,u"":-120,u"":1205,u"":2323,u"":-788,u"":-202,u"":727,u"":649,u"":5905,u"":2773,u"":-1207,u"":6620,u"":-518,u"":551,u"":1319,u"":874,u"":-1350,u"":521,u"":1109,u"":1591,u"":2201,u"":278,u"":-3794,u"":-1619,u"":-1759,u"":-2087,u"":3815,u"":653,u"":-758,u"":-1193,u"":974,u"":2742,u"":792,u"":1889,u"":-1368,u"":811,u"":4265,u"":-361,u"":-2439,u"":4858,u"":3593,u"":1574,u"":-3030,u"":755,u"":-1880,u"":5807,u"":3095,u"":457,u"":2475,u"":1129,u"":2286,u"":4437,u"":365,u"":-949,u"":-1872,u"":1327,u"":-1038,u"":4646,u"":-2309,u"":-783,u"":-1006,u"":483,u"":1233,u"":3588,u"":-241,u"":3906,u"":-837,u"":4513,u"":642,u"":1389,u"":1219,u"":-241,u"":2016,u"":-1356,u"":-423,u"":-1008,u"":1078,u"":-513,u"":-3102,u"":1155,u"":3197,u"":-1804,u"":2416,u"":-1030,u"":1605,u"":1452,u"":-2352,u"":-3885,u"":1905,u"":-1291,u"":1822,u"":-488,u"":-3973,u"":-2013,u"":-1479,u"":3222,u"":-1489,u"":1764,u"":2099,u"":5792,u"":-661,u"":-1248,u"":-951,u"":-937,u"":4125,u"":360,u"":3094,u"":364,u"":-805,u"":5156,u"":2438,u"":484,u"":2613,u"":-1694,u"":-1073,u"":1868,u"":-495,u"":979,u"":461,u"":-3850,u"":-273,u"":914,u"":1215,u"":7313,u"":-1835,u"":792,u"":6293,u"":-1528,u"":4231,u"":401,u"":-960,u"":1201,u"":7767,u"":3066,u"":3663,u"":1384,u"":-4229,u"":1163,u"":1255,u"":6457,u"":725,u"":-2869,u"":785,u"":1044,u"調":-562,u"":-733,u"":1777,u"":1835,u"":1375,u"":-1504,u"":-1136,u"":-681,u"":1026,u"":4404,u"":1200,u"":2163,u"":421,u"":-1432,u"":1302,u"":-1282,u"":2009,u"":-1045,u"":2066,u"":1620,u"":-800,u"":2670,u"":-3794,u"":-1350,u"":551,u"グ":1319,u"":874,u"":521,u"":1109,u"":1591,u"":2201,u"":278};
self._UW4 = {u",u":3930,u".":3508,u"":-4841,u"":3930,u"":3508,u"":4999,u"":1895,u"":3798,u"":-5156,u"":4752,u"":-3435,u"":-640,u"":-2514,u"":2405,u"":530,u"":6006,u"":-4482,u"":-3821,u"":-3788,u"":-4376,u"":-4734,u"":2255,u"":1979,u"":2864,u"":-843,u"":-2506,u"":-731,u"":1251,u"":181,u"":4091,u"":5034,u"":5408,u"":-3654,u"":-5882,u"":-1659,u"":3994,u"":7410,u"":4547,u"":5433,u"":6499,u"":1853,u"":1413,u"":7396,u"":8578,u"":1940,u"":4249,u"":-4134,u"":1345,u"":6665,u"":-744,u"":1464,u"":1051,u"":-2082,u"":-882,u"":-5046,u"":4169,u"":-2666,u"":2795,u"":-1544,u"":3351,u"":-2922,u"":-9726,u"":-14896,u"":-2613,u"":-4570,u"":-1783,u"":13150,u"":-2352,u"":2145,u"":1789,u"":1287,u"":-724,u"":-403,u"":-1635,u"":-881,u"":-541,u"":-856,u"":-3637,u"":-4371,u"":-11870,u"":-2069,u"":2210,u"":782,u"":-190,u"":-1768,u"":1036,u"":544,u"":950,u"":-1286,u"":530,u"":4292,u"":601,u"":-2006,u"":-1212,u"":584,u"":788,u"":1347,u"":1623,u"":3879,u"":-302,u"":-740,u"":-2715,u"":776,u"":4517,u"":1013,u"":1555,u"":-1834,u"":-681,u"":-910,u"":-851,u"":1500,u"":-619,u"":-1200,u"":866,u"":-1410,u"":-2094,u"":-1413,u"":1067,u"":571,u"":-4802,u"":-1397,u"":-1057,u"":-809,u"":1910,u"":-1328,u"":-1500,u"":-2056,u"":-2667,u"":2771,u"":374,u"":-4556,u"":456,u"":553,u"":916,u"":-1566,u"":856,u"":787,u"":2182,u"":704,u"":522,u"":-856,u"":1798,u"":1829,u"":845,u"":-9066,u"":-485,u"":-442,u"":-360,u"":-1043,u"":5388,u"":-2716,u"":-910,u"":-939,u"":-543,u"":-735,u"":672,u"":-1267,u"":-1286,u"":-1101,u"":-2900,u"":1826,u"":2586,u"":922,u"":-3485,u"":2997,u"":-867,u"":-2112,u"":788,u"":2937,u"":786,u"":2171,u"":1146,u"":-1169,u"":940,u"":-994,u"":749,u"":2145,u"":-730,u"":-852,u"":-792,u"":792,u"":-1184,u"":-244,u"":-1000,u"":730,u"":-1481,u"":1158,u"":-1433,u"":-3370,u"":929,u"":-1291,u"":2596,u"":-4866,u"":1192,u"":-1100,u"":-2213,u"":357,u"":-2344,u"":-2297,u"":-2604,u"":-878,u"":-1659,u"":-792,u"":-1984,u"":1749,u"":2120,u"":1895,u"":3798,u"":-4371,u"":-724,u"":-11870,u"":2145,u"":1789,u"":1287,u"":-403,u"":-1635,u"":-881,u"":-541,u"":-856,u"":-3637};
self._UW5 = {u",u":465,u".":-299,u"1":-514,u"E2":-32768,u"]":-2762,u"":465,u"":-299,u"":363,u"":1655,u"":331,u"":-503,u"":1199,u"":527,u"":647,u"":-421,u"":1624,u"":1971,u"":312,u"":-983,u"":-1537,u"":-1371,u"":-852,u"":-1186,u"":1093,u"":52,u"":921,u"":-18,u"":-850,u"":-127,u"":1682,u"":-787,u"":-1224,u"":-635,u"":-578,u"":1001,u"":502,u"":865,u"":3350,u"":854,u"":-208,u"":429,u"":504,u"":419,u"":-1264,u"":327,u"":241,u"":451,u"":-343,u"":-871,u"":722,u"":-1153,u"":-654,u"":3519,u"":-901,u"":848,u"":2104,u"":-1296,u"":-548,u"":1785,u"":-1304,u"":-2991,u"":921,u"":1763,u"":872,u"":-814,u"":1618,u"":-1682,u"":218,u"":-4353,u"":932,u"":1356,u"":-1508,u"":-1347,u"":240,u"":-3912,u"":-3149,u"":1319,u"":-1052,u"":-4003,u"":-997,u"":-278,u"":-813,u"":1955,u"":-2233,u"":663,u"":-1073,u"":1219,u"":-1018,u"":-368,u"":786,u"":1191,u"":2368,u"":-689,u"":-514,u"E2":-32768,u"":363,u"":241,u"":451,u"":-343};
self._UW6 = {u",u":227,u".":808,u"1":-270,u"E1":306,u"":227,u"":808,u"":-307,u"":189,u"":241,u"":-73,u"":-121,u"":-200,u"":1782,u"":383,u"":-428,u"":573,u"":-1014,u"":101,u"":-105,u"":-253,u"":-149,u"":-417,u"":-236,u"":-206,u"":187,u"":-135,u"":195,u"":-673,u"":-496,u"":-277,u"":201,u"":-800,u"":624,u"":302,u"":1792,u"":-1212,u"":798,u"":-960,u"":887,u"":-695,u"":535,u"":-697,u"":753,u"":-507,u"":974,u"":-822,u"":1811,u"":463,u"":1082,u"":-270,u"E1":306,u"":-673,u"":-496};
def _ts(self, dict, key):
if not key in dict:
return 0
else:
return dict[key]
def _ctype(self, str):
for elem in self._chartype:
match = re.match(elem[0], str)
if match:
return elem[1]
return u"O"
def tokenize(self, text):
if text == "":
return []
result = []
seg = [u"B3",u"B2",u"B1"]
ctype = [u"O",u"O",u"O"]
o = list(text)
for i in range(0, len(o)):
seg.append(o[i])
ctype.append(self._ctype(o[i]))
seg.append(u"E1")
seg.append(u"E2")
seg.append(u"E3")
ctype.append(u"O")
ctype.append(u"O")
ctype.append(u"O")
word = seg[3]
p1 = u"U"
p2 = u"U"
p3 = u"U"
for i in range(4, len(seg) - 3):
score = self._BIAS
w1 = seg[i-3]
w2 = seg[i-2]
w3 = seg[i-1]
w4 = seg[i]
w5 = seg[i+1]
w6 = seg[i+2]
c1 = ctype[i-3]
c2 = ctype[i-2]
c3 = ctype[i-1]
c4 = ctype[i]
c5 = ctype[i+1]
c6 = ctype[i+2]
score += self._ts(self._UP1, p1)
score += self._ts(self._UP2, p2)
score += self._ts(self._UP3, p3)
score += self._ts(self._BP1, p1 + p2)
score += self._ts(self._BP2, p2 + p3)
score += self._ts(self._UW1, w1)
score += self._ts(self._UW2, w2)
score += self._ts(self._UW3, w3)
score += self._ts(self._UW4, w4)
score += self._ts(self._UW5, w5)
score += self._ts(self._UW6, w6)
score += self._ts(self._BW1, w2 + w3)
score += self._ts(self._BW2, w3 + w4)
score += self._ts(self._BW3, w4 + w5)
score += self._ts(self._TW1, w1 + w2 + w3)
score += self._ts(self._TW2, w2 + w3 + w4)
score += self._ts(self._TW3, w3 + w4 + w5)
score += self._ts(self._TW4, w4 + w5 + w6)
score += self._ts(self._UC1, c1)
score += self._ts(self._UC2, c2)
score += self._ts(self._UC3, c3)
score += self._ts(self._UC4, c4)
score += self._ts(self._UC5, c5)
score += self._ts(self._UC6, c6)
score += self._ts(self._BC1, c2 + c3)
score += self._ts(self._BC2, c3 + c4)
score += self._ts(self._BC3, c4 + c5)
score += self._ts(self._TC1, c1 + c2 + c3)
score += self._ts(self._TC2, c2 + c3 + c4)
score += self._ts(self._TC3, c3 + c4 + c5)
score += self._ts(self._TC4, c4 + c5 + c6)
# score += self._ts(self._TC5, c4 + c5 + c6)
score += self._ts(self._UQ1, p1 + c1)
score += self._ts(self._UQ2, p2 + c2)
score += self._ts(self._UQ1, p3 + c3)
score += self._ts(self._BQ1, p2 + c2 + c3)
score += self._ts(self._BQ2, p2 + c3 + c4)
score += self._ts(self._BQ3, p3 + c2 + c3)
score += self._ts(self._BQ4, p3 + c3 + c4)
score += self._ts(self._TQ1, p2 + c1 + c2 + c3)
score += self._ts(self._TQ2, p2 + c2 + c3 + c4)
score += self._ts(self._TQ3, p3 + c1 + c2 + c3)
score += self._ts(self._TQ4, p3 + c2 + c3 + c4)
p = u"O"
if score > 0:
result.append(word)
word = ""
p = u"B"
p1 = p2
p2 = p3
p3 = p
word += seg[i]
result.append(word)
return result