Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: respect user configuration for work with status codes #812

Merged
merged 4 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/crawlee/_utils/http.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
from __future__ import annotations


def is_status_code_error(value: int) -> bool:
"""Returns `True` for 4xx or 5xx status codes, `False` otherwise."""
return is_status_code_client_error(value) or is_status_code_server_error(value)


def is_status_code_client_error(value: int) -> bool:
"""Returns `True` for 4xx status codes, `False` otherwise."""
return 400 <= value <= 499 # noqa: PLR2004
Expand Down
11 changes: 4 additions & 7 deletions src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,13 +204,10 @@ async def _handle_blocked_request(
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

# TODO: refactor to avoid private member access
# https://github.com/apify/crawlee-python/issues/708
if (
context.session
and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
and context.session.is_blocked_status_code(status_code=status_code)
if context.session and context.session.is_blocked_status_code(
janbuchar marked this conversation as resolved.
Show resolved Hide resolved
status_code=status_code,
additional_blocked_status_codes=self._http_client.additional_blocked_status_codes,
ignore_http_error_status_codes=self._http_client.ignore_http_error_status_codes,
):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
if blocked_info := self._parser.is_blocked(context.parsed_content):
Expand Down
5 changes: 2 additions & 3 deletions src/crawlee/basic_crawler/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@
from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.docs import docs_group
from crawlee._utils.http import is_status_code_client_error
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee._utils.wait import wait_for
from crawlee.basic_crawler._context_pipeline import ContextPipeline
from crawlee.errors import (
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
HttpStatusCodeError,
HttpClientStatusCodeError,
RequestHandlerError,
SessionError,
UserDefinedErrorHandlerError,
Expand Down Expand Up @@ -670,7 +669,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
return False

# Do not retry on client errors.
if isinstance(error, HttpStatusCodeError) and is_status_code_client_error(error.status_code):
if isinstance(error, HttpClientStatusCodeError):
return False

if isinstance(error, SessionError):
Expand Down
6 changes: 6 additions & 0 deletions src/crawlee/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
'ContextPipelineFinalizationError',
'ContextPipelineInitializationError',
'ContextPipelineInterruptedError',
'HttpClientStatusCodeError',
'HttpStatusCodeError',
'ProxyError',
'RequestHandlerError',
Expand Down Expand Up @@ -50,6 +51,11 @@ def __init__(self, message: str, status_code: int) -> None:
self.message = message


@docs_group('Errors')
class HttpClientStatusCodeError(HttpStatusCodeError):
"""Raised when the response status code indicates an client error."""


@docs_group('Errors')
class RequestHandlerError(Exception, Generic[TCrawlingContext]):
"""Wraps an exception thrown from a request handler (router) and extends it with crawling context."""
Expand Down
21 changes: 16 additions & 5 deletions src/crawlee/http_clients/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from typing import TYPE_CHECKING, Protocol

from crawlee._utils.docs import docs_group
from crawlee._utils.http import is_status_code_error
from crawlee.errors import HttpStatusCodeError
from crawlee._utils.http import is_status_code_client_error, is_status_code_server_error
from crawlee.errors import HttpClientStatusCodeError, HttpStatusCodeError

if TYPE_CHECKING:
from collections.abc import Iterable
Expand Down Expand Up @@ -150,8 +150,19 @@ def _raise_for_error_status_code(
exclude_error = status_code in ignore_http_error_status_codes
include_error = status_code in additional_http_error_status_codes

if include_error or (is_status_code_error(status_code) and not exclude_error):
if include_error:
raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code)
if include_error:
raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code)

if is_status_code_client_error(status_code) and not exclude_error:
raise HttpClientStatusCodeError('Client error status code returned', status_code)

if is_status_code_server_error(status_code) and not exclude_error:
raise HttpStatusCodeError('Error status code returned', status_code)
vdusek marked this conversation as resolved.
Show resolved Hide resolved

@property
def additional_blocked_status_codes(self) -> set[int]:
return self._additional_http_error_status_codes

@property
def ignore_http_error_status_codes(self) -> set[int]:
return self._ignore_http_error_status_codes
11 changes: 8 additions & 3 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __init__(
self._max_usage_count = max_usage_count
self._error_score = error_score
self._cookies = cookies or {}
self._blocked_status_codes = blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES
self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES)

@classmethod
def from_model(cls, model: SessionModel) -> Session:
Expand Down Expand Up @@ -193,17 +193,22 @@ def is_blocked_status_code(
self,
*,
status_code: int,
additional_blocked_status_codes: list[int] | None = None,
additional_blocked_status_codes: set[int] | None = None,
ignore_http_error_status_codes: set[int] | None = None,
janbuchar marked this conversation as resolved.
Show resolved Hide resolved
) -> bool:
"""Evaluate whether a session should be retired based on the received HTTP status code.

Args:
status_code: The HTTP status code received from a server response.
additional_blocked_status_codes: Optional additional status codes that should trigger session retirement.
ignore_http_error_status_codes: Optional status codes to allow suppression of
codes from `blocked_status_codes`.

Returns:
True if the session should be retired, False otherwise.
"""
blocked_status_codes = self._blocked_status_codes + (additional_blocked_status_codes or [])
blocked_status_codes = self._blocked_status_codes | (additional_blocked_status_codes or set())

blocked_status_codes = blocked_status_codes - (ignore_http_error_status_codes or set())
janbuchar marked this conversation as resolved.
Show resolved Hide resolved

return status_code in blocked_status_codes
Loading