Merge pull request #282 from jkawamoto/kwargs
Pass optional arguments to requests.get
fhamborg authored Sep 4, 2024
2 parents 758c008 + 30e62b7 commit c4c3d0a
Showing 2 changed files with 21 additions and 25 deletions.
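Before the diff, a caller-side sketch of what this change enables (URL and values below are illustrative, not taken from the commit): the former timeout and user_agent parameters are replaced by a single request_args dict whose keys are forwarded to requests.get, so a timeout, custom headers, proxies, or any other requests option can be supplied in one place.

from newsplease import NewsPlease

# Hypothetical call after this commit; any requests.get keyword can go into request_args.
article = NewsPlease.from_url(
    "https://example.com/some-article",           # illustrative URL
    request_args={
        "timeout": 10,                            # replaces the old timeout parameter
        "headers": {"User-Agent": "my-crawler"},  # replaces the old user_agent parameter
    },
)
if article is not None:
    print(article.title)  # NewsArticle exposes the extracted fields (title, maintext, ...)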
16 changes: 8 additions & 8 deletions newsplease/__init__.py
@@ -111,26 +111,26 @@ def from_html(html, url=None, download_date=None, fetch_images=True):
         return final_article
 
     @staticmethod
-    def from_url(url, timeout=None, user_agent=None):
+    def from_url(url, request_args=None):
         """
         Crawls the article from the url and extracts relevant information.
         :param url:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return: A NewsArticle object containing all the information of the article. Else, None.
         :rtype: NewsArticle, None
         """
-        articles = NewsPlease.from_urls([url], timeout=timeout, user_agent=user_agent)
+        articles = NewsPlease.from_urls([url], request_args=request_args)
         if url in articles.keys():
             return articles[url]
         else:
             return None
 
     @staticmethod
-    def from_urls(urls, timeout=None, user_agent=None):
+    def from_urls(urls, request_args=None):
         """
         Crawls articles from the urls and extracts relevant information.
         :param urls:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return: A dict containing given URLs as keys, and extracted information as corresponding values.
         """
         results = {}
@@ -142,10 +142,10 @@ def from_urls(urls, timeout=None, user_agent=None):
             pass
         elif len(urls) == 1:
             url = urls[0]
-            html = SimpleCrawler.fetch_url(url, timeout=timeout, user_agent=user_agent)
+            html = SimpleCrawler.fetch_url(url, request_args=request_args)
             results[url] = NewsPlease.from_html(html, url, download_date)
         else:
-            results = SimpleCrawler.fetch_urls(urls, timeout=timeout, user_agent=user_agent)
+            results = SimpleCrawler.fetch_urls(urls, request_args=request_args)
 
         futures = {}
         with cf.ProcessPoolExecutor() as exec:
@@ -158,7 +158,7 @@ def from_urls(urls, timeout=None, user_agent=None):
         for future in cf.as_completed(futures):
             url = futures[future]
             try:
-                results[url] = future.result(timeout=timeout)
+                results[url] = future.result(timeout=request_args.get("timeout"))
             except Exception as err:
                 results[url] = {}
 
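The hunks above also thread the timeout through from_urls: the value stored under the "timeout" key in request_args is forwarded to requests.get by SimpleCrawler and reused to bound future.result(). A minimal multi-URL sketch under that assumption (illustrative URLs; a dict is supplied here since the multi-URL path reads request_args directly):

from newsplease import NewsPlease

# Hypothetical call; the "timeout" entry bounds both requests.get and future.result().
articles = NewsPlease.from_urls(
    ["https://example.com/a", "https://example.com/b"],  # illustrative URLs
    request_args={"timeout": 10},
)
for url, article in articles.items():
    # Failed extractions are stored as empty dicts, successful ones as NewsArticle objects.
    print(url, getattr(article, "title", None))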
30 changes: 13 additions & 17 deletions newsplease/crawler/simple_crawler.py
@@ -28,40 +28,36 @@ class SimpleCrawler(object):
     _results = {}
 
     @staticmethod
-    def fetch_url(url, timeout=None, user_agent=USER_AGENT):
+    def fetch_url(url, request_args=None):
         """
         Crawls the html content of the parameter url and returns the html
         :param url:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return:
         """
-        return SimpleCrawler._fetch_url(url, False, timeout=timeout, user_agent=user_agent)
+        return SimpleCrawler._fetch_url(url, False, request_args=request_args)
 
     @staticmethod
-    def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
+    def _fetch_url(url, is_threaded, request_args=None):
         """
         Crawls the html content of the parameter url and saves the html in _results
         :param url:
         :param is_threaded: If True, results will be stored for later processing by the fetch_urls method. Else not.
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return: html of the url
         """
-        headers = HEADERS
-        if user_agent:
-            headers["User-Agent"] = user_agent
+        if request_args is None:
+            request_args = {}
+        if "headers" not in request_args:
+            request_args["headers"] = HEADERS
 
         html_str = None
         # send
         try:
             # read by streaming chunks (stream=True, iter_content=xx)
             # so we can stop downloading as soon as MAX_FILE_SIZE is reached
             response = requests.get(
-                url,
-                timeout=timeout,
-                verify=False,
-                allow_redirects=True,
-                headers=headers,
-            )
+                url, verify=False, allow_redirects=True, **request_args)
         except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
             LOGGER.error("malformed URL: %s", url)
         except requests.exceptions.TooManyRedirects:
@@ -91,15 +91,15 @@ def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
         return html_str
 
     @staticmethod
-    def fetch_urls(urls, timeout=None, user_agent=USER_AGENT):
+    def fetch_urls(urls, request_args=None):
         """
         Crawls the html content of all given urls in parallel. Returns when all requests are processed.
         :param urls:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `request` takes
         :return:
         """
         threads = [
-            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout, user_agent))
+            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, request_args))
             for url in urls
         ]
         for thread in threads:
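To summarise the crawler side of the change, here is a standalone sketch (not part of the commit; the HEADERS value is a stand-in for the module-level default) of the defaulting pattern _fetch_url now follows: default headers are applied only when the caller did not supply any, and every other key in request_args is handed to requests.get unchanged.

import requests

HEADERS = {"User-Agent": "news-please"}  # stand-in for the module-level default headers

def fetch(url, request_args=None):
    # Same defaulting pattern as SimpleCrawler._fetch_url after this commit.
    if request_args is None:
        request_args = {}
    if "headers" not in request_args:
        request_args["headers"] = HEADERS
    # Remaining keys (timeout, proxies, auth, cookies, ...) go straight to requests.get.
    return requests.get(url, verify=False, allow_redirects=True, **request_args)

# e.g. fetch("https://example.com", {"timeout": 5, "proxies": {"https": "http://proxy.local:8080"}})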
