# -*- coding: utf-8 -*-
"""
Newspaper uses much of python-goose's extraction code. View their license:
https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt

Keep all html page extraction code within this file. Abstract any
lxml or soup parsing code in the parsers.py file!
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import copy
import logging
import re
import re
from collections import defaultdict

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urljoin, urlparse, urlunparse

from . import urls
from .utils import StringReplacement, StringSplitter

log = logging.getLogger(__name__)

# NOTE(review): the pattern below is the Unicode replacement character
# (U+FFFD); confirm this is the intended pattern and not an encoding artifact.
MOTLEY_REPLACEMENT = StringReplacement("�", "")
ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(
    "#!", "?_escaped_fragment_=")
# NOTE(review): pattern and replacement are the same string here, so this
# looks like a no-op; presumably the pattern was once an HTML entity. Confirm.
TITLE_REPLACEMENTS = StringReplacement("»", "»")
PIPE_SPLITTER = StringSplitter("\\|")
DASH_SPLITTER = StringSplitter(" - ")
UNDERSCORE_SPLITTER = StringSplitter("_")
SLASH_SPLITTER = StringSplitter("/")
ARROWS_SPLITTER = StringSplitter(" » ")
COLON_SPLITTER = StringSplitter(":")
SPACE_SPLITTER = StringSplitter(' ')
NO_STRINGS = set()
# CSS selectors for anchors that carry tag/topic links.
A_REL_TAG_SELECTOR = "a[rel=tag]"
A_HREF_TAG_SELECTOR = ("a[href*='/tag/'], a[href*='/tags/'], "
                       "a[href*='/topic/'], a[href*='?keyword=']")
# Matches exactly two ASCII letters.
RE_LANG = r'^[A-Za-z]{2}$'

# NOTE(review): the three lists below are not used in the visible part of the
# file; presumably they classify URL paths/domains -- verify against callers.
good_paths = ['story', 'article', 'feature', 'featured', 'slides',
              'slideshow', 'gallery', 'news', 'video', 'media',
              'v', 'radio', 'press']
bad_chunks = ['careers', 'contact', 'about', 'faq', 'terms', 'privacy',
              'advert', 'preferences', 'feedback', 'info', 'browse', 'howto',
              'account', 'subscribe', 'donate', 'shop', 'admin']
bad_domains = ['amazon', 'doubleclick', 'twitter']


class ContentExtractor(object):
    def __init__(self, config):
        """Keep the configuration and the parser, language and stopwords
        class that it provides.
        """
        self.config = config
        self.parser = self.config.get_parser()
        self.language = config.language
        self.stopwords_class = config.stopwords_class

    def update_language(self, meta_lang):
        """Switch the extractor to the language `meta_lang`.

        In some cases this has to be called before the extraction process,
        because the stopwords class has to be set in case the language is
        not latin based. A falsy `meta_lang` leaves the extractor unchanged.
        """
        if meta_lang:
            self.language = meta_lang
            self.stopwords_class = \
                self.config.get_stopwords_class(meta_lang)

    def get_authors(self, doc): """Fetch the authors of the article, return as a list Only works for english articles """ _digits = re.compile('\d') def contains_digits(d): return bool(_digits.search(d)) def uniqify_list(lst): """Remove duplicates from provided list but maintain original order. Derived from http://www.peterbe.com/plog/uniqifiers-benchmark """ seen = {} result = [] for item in lst: if item.lower() in seen: continue seen[item.lower()] = 1 result.append(item.title()) return result def parse_byline(search_str): """ Takes a candidate line of html or text and extracts out the name(s) in list form: >>> parse_byline('
current_node_tag = self.parser.getTag(current_node) if current_node_tag == para: if steps_away >= max_stepsaway_from_node: return False paragraph_text = self.parser.getText(current_node) word_stats = self.stopwords_class(language=self.language). \ get_stopword_count(paragraph_text) if word_stats.get_stopword_count() > minimum_stopword_count: return True steps_away += 1 return False def walk_siblings(self, node): return self.parser.previousSiblings(node) def add_siblings(self, top_node): baseline_score_siblings_para = self.get_siblings_score(top_node) results = self.walk_siblings(top_node) for current_node in results: ps = self.get_siblings_content( current_node, baseline_score_siblings_para) for p in ps: top_node.insert(0, p) return top_node def get_siblings_content( self, current_sibling, baseline_score_siblings_para): """Adds any siblings that may have a decent score to this node """ if current_sibling.tag == 'p' and \ len(self.parser.getText(current_sibling)) > 0: e0 = current_sibling if e0.tail: e0 = copy.deepcopy(e0) e0.tail = '' return [e0] else: potential_paragraphs = self.parser.getElementsByTag( current_sibling, tag='p') if potential_paragraphs is None: return None else: ps = [] for first_paragraph in potential_paragraphs: text = self.parser.getText(first_paragraph) if len(text) > 0: word_stats = self.stopwords_class( language=self.language). \ get_stopword_count(text) paragraph_score = word_stats.get_stopword_count() sibling_baseline_score = float(.30) high_link_density = self.is_highlink_density( first_paragraph) score = float(baseline_score_siblings_para * sibling_baseline_score) if score < paragraph_score and not high_link_density: p = self.parser.createElement( tag='p', text=text, tail=None) ps.append(p) return ps def get_siblings_score(self, top_node): """We could have long articles that have tons of paragraphs so if we tried to calculate the base score against the total text score of those paragraphs it would be unfair. 
So we need to normalize the score based on the average scoring of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of 100 then 100 should be our base. """ base = 100000 paragraphs_number = 0 paragraphs_score = 0 nodes_to_check = self.parser.getElementsByTag(top_node, tag='p') for node in nodes_to_check: text_node = self.parser.getText(node) word_stats = self.stopwords_class(language=self.language). \ get_stopword_count(text_node) high_link_density = self.is_highlink_density(node) if word_stats.get_stopword_count() > 2 and not high_link_density: paragraphs_number += 1 paragraphs_score += word_stats.get_stopword_count() if paragraphs_number > 0: base = paragraphs_score / paragraphs_number return base def update_score(self, node, add_to_score): """Adds a score to the gravityScore Attribute we put on divs we'll get the current score then add the score we're passing in to the current. """ current_score = 0 score_string = self.parser.getAttribute(node, 'gravityScore') if score_string: current_score = float(score_string) new_score = current_score + add_to_score self.parser.setAttribute(node, "gravityScore", str(new_score)) def update_node_count(self, node, add_to_count): """Stores how many decent nodes are under a parent node """ current_score = 0 count_string = self.parser.getAttribute(node, 'gravityNodes') if count_string: current_score = int(count_string) new_score = current_score + add_to_count self.parser.setAttribute(node, "gravityNodes", str(new_score)) def is_highlink_density(self, e): """Checks the density of links within a node, if there is a high link to text ratio, then the text is less likely to be relevant """ links = self.parser.getElementsByTag(e, tag='a') if not links: return False text = self.parser.getText(e) words = [word for word in text.split() if word.isalnum()] if not words: return True words_number = float(len(words)) sb = [] for link in links: 
sb.append(self.parser.getText(link)) link_text = ''.join(sb) link_words = link_text.split() num_link_words = float(len(link_words)) num_links = float(len(links)) link_divisor = float(num_link_words / words_number) score = float(link_divisor * num_links) if score >= 1.0: return True return False # return True if score > 1.0 else False def get_score(self, node): """Returns the gravityScore as an integer from this node """ return self.get_node_gravity_score(node) or 0 def get_node_gravity_score(self, node): gravity_score = self.parser.getAttribute(node, 'gravityScore') if not gravity_score: return None return float(gravity_score) def nodes_to_check(self, doc): """Returns a list of nodes we want to search on like paragraphs and tables """ nodes_to_check = [] for tag in ['p', 'pre', 'td']: items = self.parser.getElementsByTag(doc, tag=tag) nodes_to_check += items return nodes_to_check def is_table_and_no_para_exist(self, e): sub_paragraphs = self.parser.getElementsByTag(e, tag='p') for p in sub_paragraphs: txt = self.parser.getText(p) if len(txt) < 25: self.parser.remove(p) sub_paragraphs_2 = self.parser.getElementsByTag(e, tag='p') if len(sub_paragraphs_2) == 0 and e.tag != "td": return True return False def is_nodescore_threshold_met(self, node, e): top_node_score = self.get_score(node) current_node_score = self.get_score(e) threshold = float(top_node_score * .08) if (current_node_score < threshold) and e.tag != 'td': return False return True def post_cleanup(self, top_node): """Remove any divs that looks like non-content, clusters of links, or paras with no gusto; add adjacent nodes which look contenty """ node = self.add_siblings(top_node) for e in self.parser.getChildren(node): e_tag = self.parser.getTag(e) if e_tag != 'p': if self.is_highlink_density(e): self.parser.remove(e) return node