fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
198 lines
5.6 KiB
Python
198 lines
5.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Stopword extraction and stopword classes.
|
|
"""
|
|
__title__ = 'newspaper'
|
|
__author__ = 'Lucas Ou-Yang'
|
|
__license__ = 'MIT'
|
|
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
|
|
|
|
import os
|
|
import re
|
|
import string
|
|
|
|
from .utils import FileHelper
|
|
|
|
TABSSPACE = re.compile(r'[\s\t]+')


def innerTrim(value):
    """Collapse whitespace runs in *value* and trim both ends.

    Every run of whitespace (spaces, tabs, line breaks) is replaced by a
    single space, any remaining line boundaries are joined away and the
    result is stripped.

    :param value: the text to normalise
    :return: the normalised text, or ``''`` when *value* is not a ``str``
    """
    if not isinstance(value, str):
        return ''
    # squeeze each whitespace run down to one space
    collapsed = TABSSPACE.sub(' ', value)
    # glue together whatever is still spread over several lines
    single_line = ''.join(collapsed.splitlines())
    return single_line.strip()
|
|
|
|
|
|
class WordStats:
    """Result record for a stop word count over one piece of text.

    Holds three values, each read and written through a getter/setter
    pair: the number of stop words found, the total number of words and
    the list of stop words themselves.
    """

    def __init__(self):
        # number of stop words (the "good" words) counted, and the total
        # number of words on the node
        self.stop_word_count, self.word_count = 0, 0
        # the stop words that were actually found
        self.stop_words = []

    # -- stop word list -------------------------------------------------

    def get_stop_words(self):
        """Return the stored list of stop words."""
        return self.stop_words

    def set_stop_words(self, words):
        """Store *words* as the list of stop words (kept by reference)."""
        self.stop_words = words

    # -- stop word counter ----------------------------------------------

    def get_stopword_count(self):
        """Return the stored number of stop words."""
        return self.stop_word_count

    def set_stopword_count(self, wordcount):
        """Store *wordcount* as the number of stop words."""
        self.stop_word_count = wordcount

    # -- total word counter ---------------------------------------------

    def get_word_count(self):
        """Return the stored total number of words."""
        return self.word_count

    def set_word_count(self, cnt):
        """Store *cnt* as the total number of words."""
        self.word_count = cnt
|
|
|
|
|
|
class StopWords(object):
    """Counts how many words of a text are stop words of one language.

    The stop word list of a language is read once from the resource file
    ``text/stopwords-<language>.txt`` (one entry per line) and cached on
    the class, so later instances for the same language reuse it.
    Subclasses override ``candidate_words`` / ``remove_punctuation`` /
    ``get_stopword_count`` for languages needing other segmentation.
    """

    # Kept for backward compatibility; no method of this class reads it.
    TRANS_TABLE = str.maketrans('', '')
    # Maps every character of string.punctuation to None. Built once here
    # instead of on every remove_punctuation() call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    # language code -> set of stop words, shared by all instances
    _cached_stop_words = {}

    def __init__(self, language='en'):
        """Load (or fetch from the cache) the stop words of *language*.

        :param language: language code used to build the resource file
            name ``stopwords-<language>.txt``
        """
        if language not in self._cached_stop_words:
            path = os.path.join('text', 'stopwords-%s.txt' % language)
            self._cached_stop_words[language] = \
                set(FileHelper.loadResourceFile(path).splitlines())
        self.STOP_WORDS = self._cached_stop_words[language]

    def remove_punctuation(self, content):
        """Return *content* with all ``string.punctuation`` characters removed.

        :param content: a ``str``, or an object with ``decode`` (such as
            UTF-8 encoded ``bytes``), which is decoded first
        :return: the text as ``str`` without ASCII punctuation
        :raises UnicodeDecodeError: if non-``str`` content is not valid
            UTF-8
        """
        # code taken from
        # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        if not isinstance(content, str):
            content = content.decode('utf-8')
        # str input is translated directly: an encode/decode round trip
        # does nothing useful and fails on text with lone surrogates.
        return content.translate(self._PUNCTUATION_TABLE)

    def candidate_words(self, stripped_input):
        """Split *stripped_input* into the words to test.

        The split is on the single space character only, so consecutive
        spaces produce empty strings, and those are counted as words by
        ``get_stopword_count``.

        :param stripped_input: text with punctuation already removed
        :return: list of candidate words
        """
        return stripped_input.split(' ')

    def get_stopword_count(self, content):
        """Count the words and stop words contained in *content*.

        Punctuation is removed and the text lower-cased before it is
        split into candidate words; every candidate that is a member of
        ``self.STOP_WORDS`` is recorded, duplicates included.

        :param content: the text to analyse; a falsy value yields an
            all-zero result
        :return: a ``WordStats`` with the word count, the stop word count
            and the list of stop words found
        """
        ws = WordStats()
        if not content:
            return ws
        stripped_input = self.remove_punctuation(content)
        word_count = 0
        overlapping_stopwords = []
        # counted while iterating because candidate_words() is not
        # required to return a sized sequence
        for word in self.candidate_words(stripped_input.lower()):
            word_count += 1
            if word in self.STOP_WORDS:
                overlapping_stopwords.append(word)

        ws.set_word_count(word_count)
        ws.set_stopword_count(len(overlapping_stopwords))
        ws.set_stop_words(overlapping_stopwords)
        return ws
|
|
|
|
|
|
class StopWordsChinese(StopWords):
    """Stop word counter for Chinese text, segmented with jieba."""

    def __init__(self, language='zh'):
        # the 'zh' stop word list is always loaded, whatever is passed in
        super().__init__(language='zh')

    def candidate_words(self, stripped_input):
        """Segment *stripped_input* with ``jieba.cut(..., cut_all=True)``.

        :param stripped_input: text with punctuation already removed
        :return: whatever ``jieba.cut`` returns for the text
        """
        # jieba builds a tree, which takes a while; importing it here
        # means that cost is only paid when Chinese is actually used
        import jieba
        segments = jieba.cut(stripped_input, cut_all=True)
        return segments
|
|
|
|
|
|
class StopWordsArabic(StopWords):
    """Stop word counter for Arabic text, tokenised and stemmed with nltk."""

    def __init__(self, language='ar'):
        # force the 'ar' language code, whatever is passed in
        super().__init__(language='ar')

    def remove_punctuation(self, content):
        """Return *content* unchanged; no punctuation is stripped here."""
        return content

    def candidate_words(self, stripped_input):
        """Tokenise *stripped_input* and stem every token.

        :param stripped_input: the text to segment
        :return: list with the ISRI stem of each token, in token order
        """
        import nltk
        stemmer = nltk.stem.isri.ISRIStemmer()
        tokens = nltk.tokenize.wordpunct_tokenize(stripped_input)
        return [stemmer.stem(token) for token in tokens]
|
|
|
|
|
|
class StopWordsKorean(StopWords):
    """Stop word counter for Korean text.

    A word counts as a stop word hit when it *ends with* one of the
    entries of the stop word list, rather than when it equals one.
    """

    def __init__(self, language='ko'):
        # the 'ko' stop word list is always loaded, whatever is passed in
        super(StopWordsKorean, self).__init__(language='ko')

    def get_stopword_count(self, content):
        """Count the words of *content* that end with a stop word.

        Punctuation is removed, the text is split into candidate words
        (without lower-casing) and each word ending with any entry of
        ``self.STOP_WORDS`` is recorded exactly once.

        :param content: the text to analyse; a falsy value yields an
            all-zero result
        :return: a ``WordStats`` with the word count, the number of
            matching words and the list of matching words
        """
        ws = WordStats()
        if not content:
            return ws
        stripped_input = self.remove_punctuation(content)
        # str.endswith accepts a tuple, so one call per word tests every
        # suffix. Empty entries are dropped because every string ends
        # with '' and would turn each word into a hit.
        suffixes = tuple(s for s in self.STOP_WORDS if s)
        word_count = 0
        overlapping_stopwords = []
        for word in self.candidate_words(stripped_input):
            word_count += 1
            # Record the word once even when several suffixes match, so
            # the stop word count can never exceed the word count.
            if word.endswith(suffixes):
                overlapping_stopwords.append(word)

        ws.set_word_count(word_count)
        ws.set_stopword_count(len(overlapping_stopwords))
        ws.set_stop_words(overlapping_stopwords)
        return ws
|
|
|
|
|
|
class StopWordsHindi(StopWords):
    """Stop word counter for Hindi text."""

    def __init__(self, language='hi'):
        # the 'hi' stop word list is always loaded, whatever is passed in
        super(StopWordsHindi, self).__init__(language='hi')

    def get_stopword_count(self, content):
        """Count the words and stop words contained in *content*.

        Punctuation is removed and the text is split into candidate
        words (without lower-casing); every candidate that is a member
        of ``self.STOP_WORDS`` is recorded, duplicates included.

        :param content: the text to analyse; a falsy value yields an
            all-zero result
        :return: a ``WordStats`` with the word count, the stop word count
            and the list of stop words found
        """
        ws = WordStats()
        if not content:
            return ws
        stripped_input = self.remove_punctuation(content)
        word_count = 0
        overlapping_stopwords = []
        for word in self.candidate_words(stripped_input):
            word_count += 1
            # Only record words that really are stop words. Appending
            # every stop word for every candidate, unconditionally,
            # would make the count words * len(STOP_WORDS) for any text.
            if word in self.STOP_WORDS:
                overlapping_stopwords.append(word)

        ws.set_word_count(word_count)
        ws.set_stopword_count(len(overlapping_stopwords))
        ws.set_stop_words(overlapping_stopwords)
        return ws
|
|
|
|
|
|
class StopWordsJapanese(StopWords):
    """Stop word counter for Japanese text, segmented with tinysegmenter."""

    def __init__(self, language='ja'):
        # the 'ja' stop word list is always loaded, whatever is passed in
        super().__init__(language='ja')

    def candidate_words(self, stripped_input):
        """Tokenise *stripped_input* with a fresh ``TinySegmenter``.

        :param stripped_input: text with punctuation already removed
        :return: the tokens produced by ``TinySegmenter.tokenize``
        """
        # imported on demand, so the module is only needed when Japanese
        # text is actually processed
        import tinysegmenter
        return tinysegmenter.TinySegmenter().tokenize(stripped_input)
|