# -*- coding: utf-8 -*- """ Source objects abstract online news source websites & domains. www.cnn.com would be its own source. """ __title__ = 'newspaper' __author__ = 'Lucas Ou-Yang' __license__ = 'MIT' __copyright__ = 'Copyright 2014, Lucas Ou-Yang' import logging from urllib.parse import urljoin, urlsplit, urlunsplit from tldextract import tldextract from . import network from . import urls from . import utils from .article import Article from .configuration import Configuration from .extractors import ContentExtractor from .settings import ANCHOR_DIRECTORY log = logging.getLogger(__name__) class Category(object): def __init__(self, url): self.url = url self.html = None self.doc = None class Feed(object): def __init__(self, url): self.url = url self.rss = None # TODO self.dom = None, speed up Feedparser NUM_THREADS_PER_SOURCE_WARN_LIMIT = 5 class Source(object): """Sources are abstractions of online news vendors like huffpost or cnn. domain = 'www.cnn.com' scheme = 'http' categories = ['http://cnn.com/world', 'http://money.cnn.com'] feeds = ['http://cnn.com/rss.atom', ..] articles = [
,
, ..] brand = 'cnn' """ def __init__(self, url, config=None, **kwargs): """The config object for this source will be passed into all of this source's children articles unless specified otherwise or re-set. """ if (url is None) or ('://' not in url) or (url[:4] != 'http'): raise Exception('Input url is bad!') self.config = config or Configuration() self.config = utils.extend_config(self.config, kwargs) self.extractor = ContentExtractor(self.config) self.url = url self.url = urls.prepare_url(url) self.domain = urls.get_domain(self.url) self.scheme = urls.get_scheme(self.url) self.categories = [] self.feeds = [] self.articles = [] self.html = '' self.doc = None self.logo_url = '' self.favicon = '' self.brand = tldextract.extract(self.url).domain self.description = '' self.is_parsed = False self.is_downloaded = False def build(self): """Encapsulates download and basic parsing with lxml. May be a good idea to split this into download() and parse() methods. """ self.download() self.parse() self.set_categories() self.download_categories() # mthread self.parse_categories() self.set_feeds() self.download_feeds() # mthread # self.parse_feeds() self.generate_articles() def purge_articles(self, reason, articles): """Delete rejected articles, if there is an articles param, purge from there, otherwise purge from source instance. Reference this StackOverflow post for some of the wonky syntax below: http://stackoverflow.com/questions/1207406/remove-items-from-a- list-while-iterating-in-python """ if reason == 'url': articles[:] = [a for a in articles if a.is_valid_url()] elif reason == 'body': articles[:] = [a for a in articles if a.is_valid_body()] return articles @utils.cache_disk(seconds=(86400 * 1), cache_folder=ANCHOR_DIRECTORY) def _get_category_urls(self, domain): """The domain param is **necessary**, see .utils.cache_disk for reasons. the boilerplate method is so we can use this decorator right. We are caching categories for 1 day. """ return self.extractor.get_category_urls(self.url, self.doc) def set_categories(self): urls = self._get_category_urls(self.domain) self.categories = [Category(url=url) for url in urls] def set_feeds(self): """Don't need to cache getting feed urls, it's almost instant with xpath """ common_feed_urls = ['/feed', '/feeds', '/rss'] common_feed_urls = [urljoin(self.url, url) for url in common_feed_urls] split = urlsplit(self.url) if split.netloc in ('medium.com', 'www.medium.com'): # should handle URL to user or user's post if split.path.startswith('/@'): new_path = '/feed/' + split.path.split('/')[1] new_parts = split.scheme, split.netloc, new_path, '', '' common_feed_urls.append(urlunsplit(new_parts)) common_feed_urls_as_categories = [Category(url=url) for url in common_feed_urls] category_urls = [c.url for c in common_feed_urls_as_categories] requests = network.multithread_request(category_urls, self.config) for index, _ in enumerate(common_feed_urls_as_categories): response = requests[index].resp if response and response.ok: common_feed_urls_as_categories[index].html = network.get_html( response.url, response=response) common_feed_urls_as_categories = [c for c in common_feed_urls_as_categories if c.html] for _ in common_feed_urls_as_categories: doc = self.config.get_parser().fromstring(_.html) _.doc = doc common_feed_urls_as_categories = [c for c in common_feed_urls_as_categories if c.doc is not None] categories_and_common_feed_urls = self.categories + common_feed_urls_as_categories urls = self.extractor.get_feed_urls(self.url, categories_and_common_feed_urls) self.feeds = [Feed(url=url) for url in urls] def set_description(self): """Sets a blurb for this source, for now we just query the desc html attribute """ desc = self.extractor.get_meta_description(self.doc) self.description = desc def download(self): """Downloads html of source """ self.html = network.get_html(self.url, self.config) def download_categories(self): """Download all category html, can use mthreading """ category_urls = [c.url for c in self.categories] requests = network.multithread_request(category_urls, self.config) for index, _ in enumerate(self.categories): req = requests[index] if req.resp is not None: self.categories[index].html = network.get_html( req.url, response=req.resp) else: log.warning(('Deleting category %s from source %s due to ' 'download error') % (self.categories[index].url, self.url)) self.categories = [c for c in self.categories if c.html] def download_feeds(self): """Download all feed html, can use mthreading """ feed_urls = [f.url for f in self.feeds] requests = network.multithread_request(feed_urls, self.config) for index, _ in enumerate(self.feeds): req = requests[index] if req.resp is not None: self.feeds[index].rss = network.get_html( req.url, response=req.resp) else: log.warning(('Deleting feed %s from source %s due to ' 'download error') % (self.categories[index].url, self.url)) self.feeds = [f for f in self.feeds if f.rss] def parse(self): """Sets the lxml root, also sets lxml roots of all children links, also sets description """ # TODO: This is a terrible idea, ill try to fix it when i'm more rested self.doc = self.config.get_parser().fromstring(self.html) if self.doc is None: log.warning('Source %s parse error.' % self.url) return self.set_description() def parse_categories(self): """Parse out the lxml root in each category """ log.debug('We are extracting from %d categories' % len(self.categories)) for category in self.categories: doc = self.config.get_parser().fromstring(category.html) category.doc = doc self.categories = [c for c in self.categories if c.doc is not None] def _map_title_to_feed(self, feed): doc = self.config.get_parser().fromstring(feed.rss) if doc is None: # http://stackoverflow.com/a/24893800 return None elements = self.config.get_parser().getElementsByTag(doc, tag='title') feed.title = next((element.text for element in elements if element.text), self.brand) return feed def parse_feeds(self): """Add titles to feeds """ log.debug('We are parsing %d feeds' % len(self.feeds)) self.feeds = [self._map_title_to_feed(f) for f in self.feeds] def feeds_to_articles(self): """Returns articles given the url of a feed """ articles = [] for feed in self.feeds: urls = self.extractor.get_urls(feed.rss, regex=True) cur_articles = [] before_purge = len(urls) for url in urls: article = Article( url=url, source_url=feed.url, config=self.config) cur_articles.append(article) cur_articles = self.purge_articles('url', cur_articles) after_purge = len(cur_articles) if self.config.memoize_articles: cur_articles = utils.memoize_articles(self, cur_articles) after_memo = len(cur_articles) articles.extend(cur_articles) log.debug('%d->%d->%d for %s' % (before_purge, after_purge, after_memo, feed.url)) return articles def categories_to_articles(self): """Takes the categories, splays them into a big list of urls and churns the articles out of each url with the url_to_article method """ articles = [] for category in self.categories: cur_articles = [] url_title_tups = self.extractor.get_urls(category.doc, titles=True) before_purge = len(url_title_tups) for tup in url_title_tups: indiv_url = tup[0] indiv_title = tup[1] _article = Article( url=indiv_url, source_url=category.url, title=indiv_title, config=self.config ) cur_articles.append(_article) cur_articles = self.purge_articles('url', cur_articles) after_purge = len(cur_articles) if self.config.memoize_articles: cur_articles = utils.memoize_articles(self, cur_articles) after_memo = len(cur_articles) articles.extend(cur_articles) log.debug('%d->%d->%d for %s' % (before_purge, after_purge, after_memo, category.url)) return articles def _generate_articles(self): """Returns a list of all articles, from both categories and feeds """ category_articles = self.categories_to_articles() feed_articles = self.feeds_to_articles() articles = feed_articles + category_articles uniq = {article.url: article for article in articles} return list(uniq.values()) def generate_articles(self, limit=5000): """Saves all current articles of news source, filter out bad urls """ articles = self._generate_articles() self.articles = articles[:limit] log.debug('%d articles generated and cutoff at %d', len(articles), limit) def download_articles(self, threads=1): """Downloads all articles attached to self """ # TODO fix how the article's is_downloaded is not set! urls = [a.url for a in self.articles] failed_articles = [] if threads == 1: for index, article in enumerate(self.articles): url = urls[index] html = network.get_html(url, config=self.config) self.articles[index].set_html(html) if not html: failed_articles.append(self.articles[index]) self.articles = [a for a in self.articles if a.html] else: if threads > NUM_THREADS_PER_SOURCE_WARN_LIMIT: log.warning(('Using %s+ threads on a single source ' 'may result in rate limiting!') % NUM_THREADS_PER_SOURCE_WARN_LIMIT) filled_requests = network.multithread_request(urls, self.config) # Note that the responses are returned in original order for index, req in enumerate(filled_requests): html = network.get_html(req.url, response=req.resp) self.articles[index].set_html(html) if not req.resp: failed_articles.append(self.articles[index]) self.articles = [a for a in self.articles if a.html] self.is_downloaded = True if len(failed_articles) > 0: log.warning('The following article urls failed the download: %s' % ', '.join([a.url for a in failed_articles])) def parse_articles(self): """Parse all articles, delete if too small """ for index, article in enumerate(self.articles): article.parse() self.articles = self.purge_articles('body', self.articles) self.is_parsed = True def size(self): """Number of articles linked to this news source """ if self.articles is None: return 0 return len(self.articles) def clean_memo_cache(self): """Clears the memoization cache for this specific news domain """ utils.clear_memo_cache(self) def feed_urls(self): """Returns a list of feed urls """ return [feed.url for feed in self.feeds] def category_urls(self): """Returns a list of category urls """ return [category.url for category in self.categories] def article_urls(self): """Returns a list of article urls """ return [article.url for article in self.articles] def print_summary(self): """Prints out a summary of the data in our source instance """ print('[source url]:', self.url) print('[source brand]:', self.brand) print('[source domain]:', self.domain) print('[source len(articles)]:', len(self.articles)) print('[source description[:50]]:', self.description[:50]) print('printing out 10 sample articles...') for a in self.articles[:10]: print('\t', '[url]:', a.url) print('\t[title]:', a.title) print('\t[len of text]:', len(a.text)) print('\t[keywords]:', a.keywords) print('\t[len of html]:', len(a.html)) print('\t==============') print('feed_urls:', self.feed_urls()) print('\r\n') print('category_urls:', self.category_urls())