Pass optional arguments to requests.get #282

Status: merged (3 commits, Sep 4, 2024)
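
For context, a minimal usage sketch of the new interface introduced by this PR (the URL and header values are placeholders, not taken from the diff):

    from newsplease import NewsPlease

    # Any keyword that requests.get accepts can be forwarded, e.g. timeout,
    # headers, proxies, or cookies.
    article = NewsPlease.from_url(
        "https://www.example.com/some-article",
        request_args={"timeout": 10, "headers": {"User-Agent": "my-crawler/1.0"}},
    )
    if article is not None:
        print(article.title)  # NewsArticle fields are unchanged by this PR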
16 changes: 8 additions & 8 deletions newsplease/__init__.py
@@ -111,26 +111,26 @@ def from_html(html, url=None, download_date=None, fetch_images=True):
         return final_article
 
     @staticmethod
-    def from_url(url, timeout=None, user_agent=None):
+    def from_url(url, request_args=None):
         """
         Crawls the article from the url and extracts relevant information.
         :param url:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `requests.get` takes
         :return: A NewsArticle object containing all the information of the article. Else, None.
         :rtype: NewsArticle, None
         """
-        articles = NewsPlease.from_urls([url], timeout=timeout, user_agent=user_agent)
+        articles = NewsPlease.from_urls([url], request_args=request_args)
         if url in articles.keys():
             return articles[url]
         else:
             return None
 
     @staticmethod
-    def from_urls(urls, timeout=None, user_agent=None):
+    def from_urls(urls, request_args=None):
         """
         Crawls articles from the urls and extracts relevant information.
         :param urls:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `requests.get` takes
         :return: A dict containing given URLs as keys, and extracted information as corresponding values.
         """
         results = {}
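
The old keyword arguments map onto the new dict-based interface roughly as follows (a hedged sketch for migration, not part of the diff):

    from newsplease import NewsPlease

    urls = ["https://www.example.com/some-article"]

    # before this PR
    articles = NewsPlease.from_urls(urls, timeout=6, user_agent="my-crawler/1.0")

    # after this PR: the same options travel inside request_args, with the
    # user agent expressed as a regular requests header
    articles = NewsPlease.from_urls(
        urls,
        request_args={"timeout": 6, "headers": {"User-Agent": "my-crawler/1.0"}},
    )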
@@ -142,10 +142,10 @@ def from_urls(urls, timeout=None, user_agent=None):
             pass
         elif len(urls) == 1:
             url = urls[0]
-            html = SimpleCrawler.fetch_url(url, timeout=timeout, user_agent=user_agent)
+            html = SimpleCrawler.fetch_url(url, request_args=request_args)
             results[url] = NewsPlease.from_html(html, url, download_date)
         else:
-            results = SimpleCrawler.fetch_urls(urls, timeout=timeout, user_agent=user_agent)
+            results = SimpleCrawler.fetch_urls(urls, request_args=request_args)
 
             futures = {}
             with cf.ProcessPoolExecutor() as exec:
@@ -158,7 +158,7 @@ def from_urls(urls, timeout=None, user_agent=None):
                 for future in cf.as_completed(futures):
                     url = futures[future]
                     try:
-                        results[url] = future.result(timeout=timeout)
+                        results[url] = future.result(timeout=(request_args or {}).get("timeout"))
                     except Exception as err:
                         results[url] = {}
 
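With this change a single "timeout" entry does double duty: it is forwarded to requests.get for the HTTP fetch and reused as the wait limit on each extraction future in the process pool. A hedged sketch of the multi-URL path (URLs are placeholders):

    from newsplease import NewsPlease

    urls = [
        "https://www.example.com/a",
        "https://www.example.com/b",
    ]
    # "timeout" bounds both the HTTP request in SimpleCrawler and the
    # future.result() wait during extraction.
    results = NewsPlease.from_urls(urls, request_args={"timeout": 10})
    for url, article in results.items():
        print(url, getattr(article, "title", None))  # {} is stored on failure
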
30 changes: 13 additions & 17 deletions newsplease/crawler/simple_crawler.py
@@ -28,40 +28,36 @@ class SimpleCrawler(object):
     _results = {}
 
     @staticmethod
-    def fetch_url(url, timeout=None, user_agent=USER_AGENT):
+    def fetch_url(url, request_args=None):
         """
         Crawls the html content of the parameter url and returns the html
         :param url:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `requests.get` takes
         :return:
         """
-        return SimpleCrawler._fetch_url(url, False, timeout=timeout, user_agent=user_agent)
+        return SimpleCrawler._fetch_url(url, False, request_args=request_args)
 
     @staticmethod
-    def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
+    def _fetch_url(url, is_threaded, request_args=None):
         """
         Crawls the html content of the parameter url and saves the html in _results
         :param url:
         :param is_threaded: If True, results will be stored for later processing by the fetch_urls method. Else not.
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `requests.get` takes
         :return: html of the url
         """
-        headers = HEADERS
-        if user_agent:
-            headers["User-Agent"] = user_agent
+        if request_args is None:
+            request_args = {}
+        if "headers" not in request_args:
+            request_args["headers"] = HEADERS
 
         html_str = None
         # send
         try:
             # read by streaming chunks (stream=True, iter_content=xx)
             # so we can stop downloading as soon as MAX_FILE_SIZE is reached
             response = requests.get(
-                url,
-                timeout=timeout,
-                verify=False,
-                allow_redirects=True,
-                headers=headers,
-            )
+                url, verify=False, allow_redirects=True, **request_args)
         except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
             LOGGER.error("malformed URL: %s", url)
         except requests.exceptions.TooManyRedirects:
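
The header handling above only fills in the module-level HEADERS when the caller did not pass a "headers" key of their own; everything else in request_args is expanded directly into requests.get. A standalone sketch of that merge logic, with a stand-in constant and a hypothetical fetch helper:

    import requests

    HEADERS = {"User-Agent": "news-please"}  # stand-in for the module constant

    def fetch(url, request_args=None):
        request_args = dict(request_args or {})      # copy so the caller's dict is untouched
        request_args.setdefault("headers", HEADERS)  # default headers only when none are given
        # verify and allow_redirects are fixed below, so supplying them again
        # inside request_args would raise a duplicate-keyword TypeError.
        return requests.get(url, verify=False, allow_redirects=True, **request_args)
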
@@ -91,15 +87,15 @@ def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
         return html_str
 
     @staticmethod
-    def fetch_urls(urls, timeout=None, user_agent=USER_AGENT):
+    def fetch_urls(urls, request_args=None):
         """
         Crawls the html content of all given urls in parallel. Returns when all requests are processed.
         :param urls:
-        :param timeout: in seconds, if None, the urllib default is used
+        :param request_args: optional arguments that `requests.get` takes
         :return:
         """
         threads = [
-            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout, user_agent))
+            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, request_args))
             for url in urls
         ]
         for thread in threads:
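Because the same request_args dict is handed to every worker thread, the options apply uniformly to all URLs in a batch; per-URL settings are not expressible through this interface. A hedged usage sketch against the new SimpleCrawler signatures (URLs and header values are placeholders):

    from newsplease.crawler.simple_crawler import SimpleCrawler

    shared_args = {"timeout": 5, "headers": {"User-Agent": "my-crawler/1.0"}}

    # single fetch
    html = SimpleCrawler.fetch_url("https://www.example.com/a", request_args=shared_args)

    # one thread per URL, all reusing the same dict
    pages = SimpleCrawler.fetch_urls(
        ["https://www.example.com/a", "https://www.example.com/b"],
        request_args=shared_args,
    )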