Files
MoFin/venv/lib/python3.12/site-packages/newspaper/nlp.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

209 lines
5.8 KiB
Python

# -*- coding: utf-8 -*-
"""
Anything natural language related should be abstracted into this file.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import re
import math
from os import path
from collections import Counter
from . import settings
# Target sentence length, in words, used by length_score(): a sentence with
# exactly this many words scores 1 and the score drops linearly with the
# distance from it.
ideal = 20.0
# Module-wide stopword set. It starts empty, is filled by load_stopwords(),
# and is consulted by keywords() and title_score().
stopwords = set()
def load_stopwords(language):
    """Add the stopword list for ``language`` to the module-level set.

    English reads the dedicated file ``settings.NLP_STOPWORDS_EN`` (the
    original note says this list differs from the regular English stopwords
    so that the tests pass, and may be changed together with the tests).
    Any other language reads ``stopwords-<language>.txt`` from
    ``settings.STOPWORDS_DIR``.

    Every line of the file, stripped of surrounding whitespace, becomes one
    entry. Entries are merged into the existing ``stopwords`` set; nothing
    is removed, so repeated calls accumulate.
    """
    global stopwords
    if language == 'en':
        source_path = settings.NLP_STOPWORDS_EN
    else:
        file_name = 'stopwords-{}.txt'.format(language)
        source_path = path.join(settings.STOPWORDS_DIR, file_name)
    with open(source_path, 'r', encoding='utf-8') as handle:
        # Read the whole file before touching the shared set, so a read
        # error leaves ``stopwords`` unchanged.
        entries = {line.strip() for line in handle}
        stopwords.update(entries)
def summarize(url='', title='', text='', max_sents=5):
    """Return up to ``max_sents`` top-scoring sentences of ``text``.

    The sentences are ranked by ``score()`` using the words of ``title``
    and the keywords of ``text``; the winners are returned in the order
    they appear in the text, not in score order.

    Returns an empty list when ``text`` or ``title`` is empty or when
    ``max_sents`` is not positive. ``url`` is accepted but not used.
    """
    if not (text and title) or max_sents <= 0:
        return []

    sentences = split_sentences(text)
    keys = keywords(text)
    title_words = split_words(title)

    # most_common() yields ((index, sentence), score) pairs.
    best = score(sentences, title_words, keys).most_common(max_sents)
    # Restore document order by sorting on the sentence index.
    in_document_order = sorted((entry for entry, _ in best),
                               key=lambda entry: entry[0])
    return [sentence for _, sentence in in_document_order]
def score(sentences, titleWords, keywords):
    """Rank every sentence and return the results as a ``Counter``.

    Each key is an ``(index, sentence)`` tuple and each value is the
    combined score built from four features:

    * overlap with the title words (weight 1.5),
    * keyword frequency, the mean of ``sbs`` and ``dbs`` scaled by 10
      (weight 2.0),
    * closeness to the ideal sentence length (weight 1.0),
    * position of the sentence within the text (weight 1.0).

    The weighted sum is divided by 4.0.
    """
    total_sentences = len(sentences)
    ranks = Counter()
    for index, sentence_text in enumerate(sentences):
        words = split_words(sentence_text)
        title_part = title_score(titleWords, words)
        length_part = length_score(len(words))
        # Positions are 1-based.
        position_part = sentence_position(index + 1, total_sentences)
        summation_part = sbs(words, keywords)
        density_part = dbs(words, keywords)
        keyword_part = (summation_part + density_part) / 2.0 * 10.0
        weighted_sum = (title_part * 1.5 + keyword_part * 2.0 +
                        length_part * 1.0 + position_part * 1.0)
        ranks[(index, sentence_text)] = weighted_sum / 4.0
    return ranks
def sbs(words, keywords):
    """Summation-based keyword score of a sentence.

    Adds up ``keywords[word]`` for every word of ``words`` that is a key
    of ``keywords`` (repeated words count each time), divides by the
    number of words and then by 10.

    Returns the integer 0 for an empty ``words``.
    """
    word_count = len(words)
    if word_count == 0:
        return 0
    matched_weights = (keywords[token] for token in words if token in keywords)
    total = 0.0
    for weight in matched_weights:
        total += weight
    return (1.0 / float(word_count) * total) / 10.0
def dbs(words, keywords):
    """Density-based keyword score of a sentence.

    For every pair of consecutive keyword occurrences in ``words`` the
    product of their two keyword scores is divided by the square of the
    distance between their positions, and these terms are summed. The sum
    is then scaled by ``1 / (k * (k + 1))``, where ``k`` is the number of
    distinct keywords present in ``words`` plus one.

    Returns the integer 0 for an empty ``words``.
    """
    if len(words) == 0:
        return 0

    accumulated = 0
    previous = None  # (position, weight) of the last keyword seen so far
    for position, token in enumerate(words):
        if token not in keywords:
            continue
        weight = keywords[token]
        if previous is not None:
            prev_position, prev_weight = previous
            gap = position - prev_position
            accumulated += (weight * prev_weight) / (gap ** 2)
        previous = (position, weight)

    # Number of distinct keywords that occur in the sentence, plus one.
    shared = set(keywords.keys()).intersection(set(words))
    k = len(shared) + 1
    return 1 / (k * (k + 1.0)) * accumulated
def split_words(text):
    """Split a string into a list of lowercase words.

    Every character that is neither a word character nor whitespace is
    deleted, then the text is split on runs of whitespace and each word is
    lowercased.

    All whitespace is kept as a separator. Previously only the plain space
    was kept, so a newline or tab between two words was deleted and the
    words were glued together ("Hello\\nworld" became "helloworld").

    Returns ``None`` when ``text`` is not a string (``re.sub`` raises
    ``TypeError``), which callers treat as "no words".
    """
    try:
        # Drop punctuation and symbols, keep word characters and whitespace.
        cleaned = re.sub(r'[^\w\s]', '', text)
    except TypeError:
        return None
    return [word.lower() for word in cleaned.split()]
def keywords(text):
    """Return the top keywords of ``text`` mapped to a frequency score.

    The text is split into words, stopwords are dropped, and the remaining
    words are counted. The (at most) ten most frequent words are kept;
    ties are broken by the word itself, in descending order. Each kept
    word scores ``count / total_words * 1.5 + 1``, where ``total_words``
    is the word count before stopword removal.

    Returns an empty dict when the text yields no words.
    """
    NUM_KEYWORDS = 10
    words = split_words(text)
    if not words:
        return dict()

    # Word total is taken before the stopwords are filtered out.
    total_words = max(len(words), 1)
    counts = Counter(word for word in words if word not in stopwords)
    ranked = sorted(counts.items(),
                    key=lambda item: (item[1], item[0]),
                    reverse=True)
    return {
        word: (count * 1.0 / total_words) * 1.5 + 1
        for word, count in ranked[:NUM_KEYWORDS]
    }
def split_sentences(text):
    """Split a large string into sentences.

    Uses the NLTK Punkt tokenizer loaded from
    ``tokenizers/punkt/english.pickle``, so the English model is applied
    whatever the language of ``text``. Sentences of 10 characters or fewer
    are discarded.

    Line breaks inside a sentence are replaced by a space. Previously they
    were deleted, which joined the last word of one line to the first word
    of the next.
    """
    # Imported lazily so that nltk is only required when summarizing.
    import nltk.data
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        sentence.replace('\n', ' ')
        for sentence in tokenizer.tokenize(text)
        if len(sentence) > 10
    ]
def length_score(sentence_len):
    """Score a sentence by how close its length is to ``ideal``.

    Returns 1 when ``sentence_len`` equals the module-level ``ideal`` and
    decreases linearly with the distance from it; the result is negative
    once the distance exceeds ``ideal``.
    """
    deviation = math.fabs(ideal - sentence_len)
    return 1 - deviation / ideal
def title_score(title, sentence):
    """Score a sentence by how many of its words also occur in the title.

    Stopwords are removed from ``title`` first. Every non-stopword of
    ``sentence`` found in the remaining title words counts once (repeats
    count again), and the count is divided by the number of remaining
    title words (at least 1).

    Returns the integer 0 when ``title`` is empty or ``None``.
    """
    if not title:
        return 0
    title_words = [word for word in title if word not in stopwords]
    matches = sum(
        1 for word in sentence
        if word not in stopwords and word in title_words
    )
    return float(matches) / max(len(title_words), 1)
def sentence_position(i, size):
    """Score a sentence by its relative position in the text.

    ``i`` is the position of the sentence and ``size`` the number of
    sentences; ``i / size`` is looked up in a fixed table of bands, each
    with its own weight. Ratios above 1.0 or not above 0 score 0.
    """
    normalized = i * 1.0 / size
    # (exclusive lower bound, weight), checked from the highest bound down;
    # the first bound that ``normalized`` exceeds decides the weight.
    bands = (
        (1.0, 0),
        (0.9, 0.15),
        (0.8, 0.04),
        (0.7, 0.04),
        (0.6, 0.06),
        (0.5, 0.04),
        (0.4, 0.05),
        (0.3, 0.08),
        (0.2, 0.14),
        (0.1, 0.23),
        (0, 0.17),
    )
    for lower_bound, weight in bands:
        if normalized > lower_bound:
            return weight
    return 0