diff --git a/newsplease/__init__.py b/newsplease/__init__.py
index 232629a8..3f69b381 100644
--- a/newsplease/__init__.py
+++ b/newsplease/__init__.py
@@ -111,26 +111,26 @@ def from_html(html, url=None, download_date=None, fetch_images=True):
         return final_article
 
     @staticmethod
-    def from_url(url, timeout=None, user_agent=None):
+    def from_url(url, request_args=None):
         """
         Crawls the article from the url and extracts relevant information.
         :param url:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return: A NewsArticle object containing all the information of the article. Else, None.
         :rtype: NewsArticle, None
         """
-        articles = NewsPlease.from_urls([url], timeout=timeout, user_agent=user_agent)
+        articles = NewsPlease.from_urls([url], request_args=request_args)
         if url in articles.keys():
             return articles[url]
         else:
             return None
 
     @staticmethod
-    def from_urls(urls, timeout=None, user_agent=None):
+    def from_urls(urls, request_args=None):
         """
         Crawls articles from the urls and extracts relevant information.
         :param urls:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return: A dict containing given URLs as keys, and extracted information as corresponding values.
         """
         results = {}
@@ -142,10 +142,10 @@ def from_urls(urls, timeout=None, user_agent=None):
             pass
         elif len(urls) == 1:
             url = urls[0]
-            html = SimpleCrawler.fetch_url(url, timeout=timeout, user_agent=user_agent)
+            html = SimpleCrawler.fetch_url(url, request_args=request_args)
             results[url] = NewsPlease.from_html(html, url, download_date)
         else:
-            results = SimpleCrawler.fetch_urls(urls, timeout=timeout, user_agent=user_agent)
+            results = SimpleCrawler.fetch_urls(urls, request_args=request_args)
 
             futures = {}
             with cf.ProcessPoolExecutor() as exec:
@@ -158,7 +158,7 @@ def from_urls(urls, timeout=None, user_agent=None):
                 for future in cf.as_completed(futures):
                     url = futures[future]
                     try:
-                        results[url] = future.result(timeout=timeout)
+                        results[url] = future.result(timeout=request_args.get("timeout") if request_args else None)
                     except Exception as err:
                         results[url] = {}
 
diff --git a/newsplease/crawler/simple_crawler.py b/newsplease/crawler/simple_crawler.py
index f0e724bb..e5bf35b1 100644
--- a/newsplease/crawler/simple_crawler.py
+++ b/newsplease/crawler/simple_crawler.py
@@ -28,27 +28,28 @@ class SimpleCrawler(object):
     _results = {}
 
     @staticmethod
-    def fetch_url(url, timeout=None, user_agent=USER_AGENT):
+    def fetch_url(url, request_args=None):
         """
         Crawls the html content of the parameter url and returns the html
         :param url:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return:
         """
-        return SimpleCrawler._fetch_url(url, False, timeout=timeout, user_agent=user_agent)
+        return SimpleCrawler._fetch_url(url, False, request_args=request_args)
 
     @staticmethod
-    def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
+    def _fetch_url(url, is_threaded, request_args=None):
         """
         Crawls the html content of the parameter url and saves the html in _results
         :param url:
         :param is_threaded: If True, results will be stored for later processing by the fetch_urls method. Else not.
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return: html of the url
         """
-        headers = HEADERS
-        if user_agent:
-            headers["User-Agent"] = user_agent
+        if request_args is None:
+            request_args = {}
+        if "headers" not in request_args:
+            request_args["headers"] = HEADERS
 
         html_str = None
         # send
@@ -56,12 +57,7 @@ def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
             # read by streaming chunks (stream=True, iter_content=xx)
             # so we can stop downloading as soon as MAX_FILE_SIZE is reached
             response = requests.get(
-                url,
-                timeout=timeout,
-                verify=False,
-                allow_redirects=True,
-                headers=headers,
-            )
+                url, verify=False, allow_redirects=True, **request_args)
         except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
             LOGGER.error("malformed URL: %s", url)
         except requests.exceptions.TooManyRedirects:
@@ -91,15 +87,15 @@ def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
         return html_str
 
     @staticmethod
-    def fetch_urls(urls, timeout=None, user_agent=USER_AGENT):
+    def fetch_urls(urls, request_args=None):
         """
         Crawls the html content of all given urls in parallel. Returns when all requests are processed.
         :param urls:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return:
         """
         threads = [
-            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout, user_agent))
+            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, request_args))
             for url in urls
         ]
         for thread in threads:
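
For reference, a minimal usage sketch of the consolidated request_args parameter. This is an illustration only: the URL, timeout, and header values below are made up, and the dict is simply forwarded as keyword arguments to requests.get().

    from newsplease import NewsPlease

    # Hypothetical values; request_args is passed straight through to requests.get(),
    # so any of its keyword arguments (timeout, headers, proxies, cookies, ...) fits here.
    article = NewsPlease.from_url(
        "https://www.example.com/some-article.html",      # illustrative URL
        request_args={
            "timeout": 10,                                 # replaces the old timeout= parameter (seconds)
            "headers": {"User-Agent": "my-crawler/1.0"},   # replaces the old user_agent= parameter
        },
    )
    if article is not None:
        print(article.title)

Note one consequence of the fallback in _fetch_url: a caller-supplied "headers" dict replaces the default HEADERS entirely rather than being merged with it, so callers who only want to override the User-Agent need to carry over any other default headers they still rely on.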