From daef987fbfc751ffd2161810aca70cf727896e82 Mon Sep 17 00:00:00 2001 From: Robert Szefler Date: Sat, 24 Feb 2024 10:21:56 +0100 Subject: [PATCH 1/7] fix playbook name: WeeklyPopeyScan -> WeeklyPopeyeScan (#1308) --- helm/robusta/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index e37a50e01..6f1cc3428 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -459,7 +459,7 @@ platformPlaybooks: sinks: - "robusta_ui_sink" -- name: "WeeklyPopeyScan" +- name: "WeeklyPopeyeScan" triggers: - on_schedule: fixed_delay_repeat: From 383adcfbd5a9c847192824ef00082bdd180f6cca Mon Sep 17 00:00:00 2001 From: Ganesh Rathinavel Medayil <182092+ganeshrvel@users.noreply.github.com> Date: Sat, 24 Feb 2024 14:54:40 +0530 Subject: [PATCH 2/7] Removed RSA pair (#1307) --- .gitignore | 1 + helm/robusta/templates/NOTES.txt | 8 ++++++-- helm/robusta/templates/auth-config.yaml | 16 ---------------- helm/robusta/values.yaml | 5 ----- src/robusta/api/__init__.py | 1 - src/robusta/cli/main.py | 7 ------- src/robusta/utils/auth_provider.py | 6 +++--- 7 files changed, 10 insertions(+), 34 deletions(-) delete mode 100644 helm/robusta/templates/auth-config.yaml diff --git a/.gitignore b/.gitignore index 15122646b..1073ffbd2 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ generated_values*.yaml skaffold.dev.yaml tests/last_used_tag.txt pytest.ini +gen-config-test diff --git a/helm/robusta/templates/NOTES.txt b/helm/robusta/templates/NOTES.txt index b8ab041cd..13d78e69d 100644 --- a/helm/robusta/templates/NOTES.txt +++ b/helm/robusta/templates/NOTES.txt @@ -5,7 +5,7 @@ This data is extremely limited and contains only general metadata to help us und If you are willing to share additional data, please do so! It really help us improve Robusta. You can set sendAdditionalTelemetry: true as a Helm value to send exception reports and additional data. -This is disabled by default. 
+This is disabled by default. To opt-out of telemetry entirely, set a ENABLE_TELEMETRY=false environment variable on the robusta-runner deployment. @@ -13,4 +13,8 @@ To opt-out of telemetry entirely, set a ENABLE_TELEMETRY=false environment varia {{if .robusta_sink}} Visit the web UI at: https://platform.robusta.dev/ {{- end }} -{{- end }} \ No newline at end of file +{{- end }} + +{{- if .Values.rsa }} +NOTICE: RSA is no longer used by default and can be removed from the values file +{{- end }} diff --git a/helm/robusta/templates/auth-config.yaml b/helm/robusta/templates/auth-config.yaml deleted file mode 100644 index 22493ad87..000000000 --- a/helm/robusta/templates/auth-config.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if and .Values.rsa (not .Values.rsa.existingSecret) }} -apiVersion: v1 -kind: Secret -metadata: - name: robusta-auth-config-secret - namespace: {{ .Release.Namespace }} -type: Opaque -data: -{{- if and .Values.rsa.public .Values.rsa.private }} - prv: {{ .Values.rsa.private }} - pub: {{ .Values.rsa.public }} -{{- else }} - prv: {{ .Values.rsa.prv | b64enc }} - pub: {{ .Values.rsa.pub | b64enc }} -{{- end }} -{{- end }} diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index 6f1cc3428..ca9842498 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -692,11 +692,6 @@ kube-prometheus-stack: limits: memory: 256Mi -rsa: ~ - # @param existingSecret Name of existing secret containing the rsa keys - # NOTE: Must contain the keys `pub` and `prv` - # existingSecret: my-robusta-rsa-keys - # custom parameters for OpenShift clusters openshift: enabled: false diff --git a/src/robusta/api/__init__.py b/src/robusta/api/__init__.py index 4bb462bd5..1ce40a73e 100644 --- a/src/robusta/api/__init__.py +++ b/src/robusta/api/__init__.py @@ -70,7 +70,6 @@ ROBUSTA_LOGO_URL, ROBUSTA_TELEMETRY_ENDPOINT, ROBUSTA_UI_DOMAIN, - RSA_KEYS_PATH, RUNNER_VERSION, SEND_ADDITIONAL_TELEMETRY, SERVICE_CACHE_MAX_SIZE, diff --git a/src/robusta/cli/main.py 
b/src/robusta/cli/main.py index 2279281a3..4c1d34037 100755 --- a/src/robusta/cli/main.py +++ b/src/robusta/cli/main.py @@ -16,9 +16,7 @@ from pydantic import BaseModel, Extra from robusta._version import __version__ -from robusta.cli.auth import RSAKeyPair from robusta.cli.auth import app as auth_commands -from robusta.cli.auth import gen_rsa_pair from robusta.cli.backend_profile import backend_profile from robusta.cli.eula import handle_eula from robusta.cli.integrations_cmd import app as integrations_commands @@ -88,7 +86,6 @@ class HelmValues(BaseModel, extra=Extra.allow): kubewatch: Dict = None grafanaRenderer: Dict = None runner: Dict = None - rsa: RSAKeyPair = None def get_slack_channel() -> str: @@ -266,7 +263,6 @@ def gen_config( enablePrometheusStack=enable_prometheus_stack, disableCloudRouting=disable_cloud_routing, enablePlatformPlaybooks=enable_platform_playbooks, - rsa=gen_rsa_pair(), ) values.runner = {} @@ -327,9 +323,6 @@ def update_config( """ with open(existing_values, "r") as existing_values_file: values: HelmValues = HelmValues(**yaml.safe_load(existing_values_file)) - if not values.rsa: - typer.secho("Generating RSA key-pair", fg="green") - values.rsa = gen_rsa_pair() if not values.globalConfig.signing_key: typer.secho("Generating signing key", fg="green") diff --git a/src/robusta/utils/auth_provider.py b/src/robusta/utils/auth_provider.py index 466bef29a..956cc3933 100644 --- a/src/robusta/utils/auth_provider.py +++ b/src/robusta/utils/auth_provider.py @@ -10,7 +10,7 @@ class AuthProvider: def __init__(self): - logging.info(f"Loading RSA keys from {RSA_KEYS_PATH}") + logging.debug(f"Loading RSA keys from {RSA_KEYS_PATH}") self.prv: RSAPrivateKey = self.__class__._load_private_key(os.path.join(RSA_KEYS_PATH, "prv")) self.pub: RSAPublicKey = self.__class__._load_public_key(os.path.join(RSA_KEYS_PATH, "pub")) @@ -24,7 +24,7 @@ def get_public_rsa_key(self) -> RSAPublicKey: def _load_private_key(file_name: str) -> Optional[RSAPrivateKey]: try: if 
not os.path.isfile(file_name): - logging.info(f"no rsa private key at {file_name}") + logging.debug(f"no rsa private key at {file_name}") return None with open(file_name, "rb") as key_file: private_key = serialization.load_pem_private_key( @@ -42,7 +42,7 @@ def _load_private_key(file_name: str) -> Optional[RSAPrivateKey]: def _load_public_key(file_name: str) -> Optional[RSAPublicKey]: try: if not os.path.isfile(file_name): - logging.info(f"no rsa public key at {file_name}") + logging.debug(f"no rsa public key at {file_name}") return None with open(file_name, "rb") as key_file: public_key = serialization.load_pem_public_key(key_file.read()) From aa8372db6999f56c8cfe13c4c58ea178f23bb413 Mon Sep 17 00:00:00 2001 From: arik Date: Mon, 26 Feb 2024 11:10:46 +0200 Subject: [PATCH 3/7] Wrap running as sub-process with an env variable (#1309) * Wrap running as sub-process with an env variable When running from the ide, the server does not exit cleanly * CR comments --- src/robusta/core/model/env_vars.py | 2 ++ src/robusta/runner/process_setup.py | 27 +++++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/robusta/core/model/env_vars.py b/src/robusta/core/model/env_vars.py index 198a77bcf..fb859279f 100644 --- a/src/robusta/core/model/env_vars.py +++ b/src/robusta/core/model/env_vars.py @@ -106,3 +106,5 @@ def load_bool(env_var, default: bool): IS_OPENSHIFT = load_bool("IS_OPENSHIFT", False) ENABLE_GRAPH_BLOCK = load_bool("ENABLE_GRAPH_BLOCK", False) + +RUN_AS_SUBPROCESS = load_bool("RUN_AS_SUBPROCESS", True) diff --git a/src/robusta/runner/process_setup.py b/src/robusta/runner/process_setup.py index a5ff4d404..3883f77ed 100644 --- a/src/robusta/runner/process_setup.py +++ b/src/robusta/runner/process_setup.py @@ -1,18 +1,21 @@ import os import sys +from robusta.core.model.env_vars import RUN_AS_SUBPROCESS + def process_setup(): - if os.fork(): - # Parent process, pid 1 in our deployment scenario. 
Wait (blocking - doesn't - # utilitze any CPU) for the forked "main" process to exit (if it ever does) - os.wait() - # At this point we are sure no subprocesses are running, so we can safely - # exit the pid 1 process, effectively causing the Docker image (and thus - # the k8s pod) to terminate. - sys.exit(1) + if RUN_AS_SUBPROCESS: + if os.fork(): + # Parent process, pid 1 in our deployment scenario. Wait (blocking - doesn't + # utilitze any CPU) for the forked "main" process to exit (if it ever does) + os.wait() + # At this point we are sure no subprocesses are running, so we can safely + # exit the pid 1 process, effectively causing the Docker image (and thus + # the k8s pod) to terminate. + sys.exit(1) - # Child process; create a process group to conveniently terminate the process - # along with subprocesses if need be via killpg. Currently the only use is in - # robusta.runner.config_loader.ConfigLoader.__reload_playbook_packages. - os.setpgrp() + # Child process; create a process group to conveniently terminate the process + # along with subprocesses if need be via killpg. Currently the only use is in + # robusta.runner.config_loader.ConfigLoader.__reload_playbook_packages. 
+ os.setpgrp() From adf73a3a535df916ac641bdfae609e7d5891ca85 Mon Sep 17 00:00:00 2001 From: Robert Szefler Date: Mon, 26 Feb 2024 22:32:47 +0100 Subject: [PATCH 4/7] "scope" matching for sinks (#1298) * "scope" matching for sinks * simplify include/exclude handling for empty/undefined include/exclude specs * unit tests * docs * some unit tests for exact vs regex matching * fix matching infrastructure * scope test framework * minor fix * bug fixes * added more tests fixed bug on labels matcher * Changed the order of sections, so the scope approach appears first Some additional minor changes * some test-related code simplifications * fix tests * improved label handling, more unit tests * fix path in unit tests * minor fixes * more unit tests - label matching etc * some more tests --------- Co-authored-by: Arik Alon --- docs/configuration/configuring-sinks.rst | 146 +++++++- src/robusta/core/reporting/base.py | 75 +++- src/robusta/core/reporting/consts.py | 11 + src/robusta/core/sinks/sink_base.py | 5 +- src/robusta/core/sinks/sink_base_params.py | 31 ++ tests/scope_test_config.yaml | 383 +++++++++++++++++++++ tests/test_sink_scope.py | 205 +++++++++++ 7 files changed, 832 insertions(+), 24 deletions(-) create mode 100644 tests/scope_test_config.yaml create mode 100644 tests/test_sink_scope.py diff --git a/docs/configuration/configuring-sinks.rst b/docs/configuration/configuring-sinks.rst index 204814a92..4ee246ebe 100644 --- a/docs/configuration/configuring-sinks.rst +++ b/docs/configuration/configuring-sinks.rst @@ -18,7 +18,8 @@ Sinks are defined in Robusta's Helm chart, using the ``sinksConfig`` value: name: my_teams_sink # arbitrary name webhook_url: # a sink-specific parameter stop: false # optional (see `Routing Alerts to only one Sink`) - match: {} # optional routing rules (see below) + scope: {} # optional routing rules + match: {} # optional routing rules (deprecated; see below) default: true # optional (see below) To add a sink, update ``sinksConfig`` 
according to the instructions in :ref:`Sinks Reference`. Then do a :ref:`Helm Upgrade `. @@ -28,7 +29,7 @@ Integrate as many sinks as you like. .. _sink-matchers: -Routing Alerts to only one Sink +Routing Alerts to Only One Sink ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ By default, alerts are sent to all sinks that matches the alerts. @@ -44,17 +45,129 @@ The sinks evaluation order, is the order defined in ``generated_values.yaml``. name: production_sink slack_channel: production-notifications api_key: secret-key - match: - namespace: production + scope: + include: + - namespace: production stop: true -Routing Alerts to Specific Sinks -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Routing Alerts To Specific Sinks +*************************************** + +Define which messages a sink accepts using ``scope``. + +For example, **Slack** can be integrated to receive high-severity messages in a specific namespace. Other messages will not be sent to this **Slack** sink. + +.. code-block:: yaml + + sinksConfig: + - slack_sink: + name: test_sink + slack_channel: test-notifications + api_key: secret-key + scope: + include: # more options available - see below + - namespace: [prod] + severity: HIGH + +Each attribute expression used in the ``scope`` specification can be 1 item, or a list, where each is either a regex or an exact match + +``Scope`` allows specifying a set of ``include`` and ``exclude`` sections: + +.. code-block:: yaml + + sinksConfig: + - slack_sink: + name: prod_slack_sink + slack_channel: prod-notifications + api_key: secret-key + scope: + # AND between namespace and labels, but OR within each selector + include: + - namespace: default + labels: "instance=1,foo!=x.*" + - namespace: bla + name: + - foo + - qux + exclude: + - type: ISSUE + title: .*crash.* + - name: bar[a-z]* + + +In order for a message to be sent to a ``Sink``, it must match **one of** the ``include`` sections, and **must not** match **all** the ``exclude`` sections. 
+ +When multiple attributes conditions are present, all must be satisfied. + +The following attributes can be included in an ``include``/``excluded`` block: + +- ``title``: e.g. ``Crashing pod foo in namespace default`` +- ``name`` : the Kubernetes object name +- ``namespace``: the Kubernetes object namespace +- ``node`` : the Kubernetes node name +- ``severity``: one of ``INFO``, ``LOW``, ``MEDIUM``, ``HIGH`` +- ``type``: one of ``ISSUE``, ``CONF_CHANGE``, ``HEALTH_CHECK``, ``REPORT`` +- ``kind``: one of ``deployment``, ``node``, ``pod``, ``job``, ``daemonset`` +- ``source``: one of ``NONE``, ``KUBERNETES_API_SERVER``, ``PROMETHEUS``, ``MANUAL``, ``CALLBACK`` +- ``identifier``: e.g. ``report_crash_loop`` +- ``labels``: A comma separated list of ``key=val`` e.g. ``foo=bar,instance=123`` +- ``annotations``: A comma separated list of ``key=val`` e.g. ``app.kubernetes.io/name=prometheus`` + +.. note:: + + ``labels`` and ``annotations`` are both the Kubernetes resource labels and annotations + (e.g. pod labels) and the Prometheus alert labels and annotations. If both contains the + same label/annotation, the value from the Prometheus alert is preferred. + + +.. details:: How do I find the ``identifier`` value to use in a match block? (deprecated) + + For Prometheus alerts, it's always the alert name. + + .. TODO: update after we finish our improvements here: + .. For builtin APIServer alerts, it can vary, but common values are ``report_crash_loop``, ``image_pull_backoff_reporter``, ``ConfigurationChange/KubernetesResource/Change``, and ``job_failure``. + + For custom playbooks, it's the value you set in :ref:`create_finding` under ``aggregation_key``. + + Ask us in Slack if you need help. + +By default, every message is sent to every matching sink. To change this behaviour, you can mark a sink as :ref:`non-default `. + +The top-level mechanism works as follows: + +#. If the notification is **excluded** by any of the sink ``scope`` excludes - drop it +#. 
If the notification is **included** by any of the sink ``scope`` includes - accept it +#. If the notification is **included** by any of the sink ``matchers`` - accept it (Deprecated) + +Any of (but not both) of the ``include`` and ``exclude`` may be left undefined or empty. +An undefined/empty ``include`` section will effectively allow all alerts, and an +undefined/empty ``exclude`` section will not exclude anything. + +Inside the ``include`` and ``exclude`` section, at the topmost level, the consecutive +items act with the OR logic, meaning that it's enough to match a single item in the +list in order to allow/reject a message. The same applies to the items listed under +each attribute name. + +Within a specific ``labels`` or ``annotations`` expression, the logic is ``AND`` + +.. code-block:: yaml + + .... + scope: + include: + - labels: "instance=1,foo=x.*" + ..... + +The above requires that the ``instance`` will have a value of ``1`` **AND** the ``foo`` label values starts with ``x`` + +Match Section (Deprecated) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Define which messages a sink accepts using *matchers*. -For example, Slack can be integrated to receive high-severity messages in a specific namespace. Other messages will not be sent to Slack. +For example, Slack can be integrated to receive high-severity messages in a specific +namespace. Other messages will not be sent to Slack. .. code-block:: yaml @@ -86,11 +199,12 @@ The following attributes can be included in a *match* block: .. note:: - ``labels`` and ``annotations`` are both the Kubernetes resource labels and annotations (e.g. pod labels) and the Prometheus alert labels and annotations. - If both contains the same label/annotation, the value from the Prometheus alert is preferred. + ``labels`` and ``annotations`` are both the Kubernetes resource labels and annotations + (e.g. pod labels) and the Prometheus alert labels and annotations. 
If both contains the + same label/annotation, the value from the Prometheus alert is preferred. -.. details:: How do I find the ``identifier`` value to use in a match block? +.. details:: How do I find the ``identifier`` value to use in a match block? (deprecated) For Prometheus alerts, it's always the alert name. @@ -103,8 +217,8 @@ The following attributes can be included in a *match* block: By default, every message is sent to every matching sink. To change this behaviour, you can mark a sink as :ref:`non-default `. -Matches Can Be Lists Or Regexes -******************************************** +Match Section (Deprecated): Matches Can Be Lists or Regexes +*********************************************************** *match* rules support both regular expressions and lists of exact values: @@ -122,8 +236,8 @@ Matches Can Be Lists Or Regexes Regular expressions must be in `Python re module format `_, as passed to `re.match `_. -Matching Labels and Annotations -******************************************** +Match Section (Deprecated): Matching Labels and Annotations +*********************************************************** Special syntax is used for matching labels and annotations: @@ -139,8 +253,8 @@ Special syntax is used for matching labels and annotations: The syntax is similar to Kubernetes selectors, but only `=` conditions are allowed, not `!=` -Or Between Matches -******************************************** +Match Section (Deprecated): Or Between Matches +********************************************** You can use `Or` between *match* rules: diff --git a/src/robusta/core/reporting/base.py b/src/robusta/core/reporting/base.py index 5e30faf45..ec376caef 100644 --- a/src/robusta/core/reporting/base.py +++ b/src/robusta/core/reporting/base.py @@ -108,8 +108,13 @@ class Enrichment: enrichment_type: Optional[EnrichmentType] title: Optional[str] - def __init__(self, blocks: List[BaseBlock], annotations: Optional[Dict[str, str]] = None, - enrichment_type: 
Optional[EnrichmentType] = None, title: Optional[str] = None): + def __init__( + self, + blocks: List[BaseBlock], + annotations: Optional[Dict[str, str]] = None, + enrichment_type: Optional[EnrichmentType] = None, + title: Optional[str] = None, + ): if annotations is None: annotations = {} self.blocks = blocks @@ -146,17 +151,72 @@ def __value_match(value: Union[str, Dict[str, str]], expression: Union[str, Dict logging.error(f"Failed to evaluate matcher. Finding value: {value} matcher: {expression}") return False - def matches(self, requirements: Dict[str, Union[str, List[str]]]) -> bool: - invalid_attributes = self.get_invalid_attributes(list(requirements.keys())) + def matches(self, match_requirements: Dict[str, Union[str, List[str]]], scope_requirements) -> bool: + # 1. "scope" check + accept = True + if scope_requirements is not None: + if scope_requirements.exclude: + if self.scope_inc_exc_matches(scope_requirements.exclude): + return False + if scope_requirements.include: + if self.scope_inc_exc_matches(scope_requirements.include): + return True + else: # include was defined, but not matched. So if not matched by old matcher, should be rejected! + accept = False + + # 2. "match" check + invalid_attributes = self.get_invalid_attributes(list(match_requirements.keys())) if len(invalid_attributes) > 0: logging.warning(f"Invalid match attributes: {invalid_attributes}") return False - for attribute, expression in requirements.items(): + for attribute, expression in match_requirements.items(): if not self.attribute_matches(attribute, expression): return False + return accept + + def scope_inc_exc_matches(self, scope_inc_exc: Optional[list]): + return any(self.scope_matches(scope) for scope in scope_inc_exc) + + def scope_matches(self, scope: Dict[str, List[str]]): + # scope is e.g. 
{'labels': ['app=oomki.*,app!=X.*Y']} + # or {'name': ['pod-xyz.*'], 'title': ['fdc.*a', 'fdd.*b'], 'type': ['ISSUE']} + for attr_name, attr_matchers in scope.items(): + if not self.scope_attribute_matches(attr_name, attr_matchers): + return False + return True + + def scope_attribute_matches(self, attr_name: str, attr_matchers: List[str]): + if attr_name not in self.attribute_map: + raise ValueError(f'Scope match on unknown attribute "{attr_name}"') + attr_value = self.attribute_map[attr_name] + for attr_matcher in attr_matchers: + if attr_name in ["labels", "annotations"]: + return self.match_labels_annotations(attr_matcher, attr_value) + elif re.fullmatch(attr_matcher, attr_value): + return True + return False + + def match_labels_annotations(self, labels_match_expr: str, labels: Dict[str, str]): + for label_match in labels_match_expr.split(","): + if not self.label_matches(label_match, labels): + return False return True + def label_matches(self, label_match: str, labels: Dict[str, str]): + label_name, label_regex = label_match.split("=", 1) + label_name = label_name.strip() + label_regex = label_regex.strip() + if label_name.endswith("!"): # label_name!=match_expr + label_name = label_name[:-1].rstrip() + expect_match = False + else: + expect_match = True + label_value = labels.get(label_name) + if label_value is None: # no label with that name + return False + return bool(re.fullmatch(label_regex, label_value.strip())) == expect_match + class FindingSubject: def __init__( @@ -294,8 +354,9 @@ def add_enrichment( return if annotations is None: annotations = {} - self.enrichments.append(Enrichment(blocks=enrichment_blocks, annotations=annotations, - enrichment_type=enrichment_type, title=title)) + self.enrichments.append( + Enrichment(blocks=enrichment_blocks, annotations=annotations, enrichment_type=enrichment_type, title=title) + ) def add_video_link(self, video_link: VideoLink, suppress_warning: bool = False): if self.dirty and not suppress_warning: diff 
--git a/src/robusta/core/reporting/consts.py b/src/robusta/core/reporting/consts.py index fb2282e22..415bd74a9 100644 --- a/src/robusta/core/reporting/consts.py +++ b/src/robusta/core/reporting/consts.py @@ -9,6 +9,10 @@ class FindingType(Enum): HEALTH_CHECK = "health_check" REPORT = "report" + @classmethod + def from_type(cls, finding_type: str) -> "FindingType": + return cls(finding_type.lower()) + class FindingAggregationKey(Enum): NONE = None # empty default @@ -25,6 +29,13 @@ class FindingSource(Enum): CALLBACK = "callback" SCHEDULER = "scheduler" + @classmethod + def from_source(cls, source: str) -> "FindingSource": + try: + return cls(source.lower()) + except ValueError: + return cls.NONE + # Finding subject types class FindingSubjectType(Enum): diff --git a/src/robusta/core/sinks/sink_base.py b/src/robusta/core/sinks/sink_base.py index ec8be3192..458a69492 100644 --- a/src/robusta/core/sinks/sink_base.py +++ b/src/robusta/core/sinks/sink_base.py @@ -42,7 +42,10 @@ def stop(self): pass def accepts(self, finding: Finding) -> bool: - return finding.matches(self.params.match) and any(time_slice.is_active_now for time_slice in self.time_slices) + return ( + finding.matches(self.params.match, self.params.scope) + and any(time_slice.is_active_now for time_slice in self.time_slices) + ) def write_finding(self, finding: Finding, platform_enabled: bool): raise NotImplementedError(f"write_finding not implemented for sink {self.sink_name}") diff --git a/src/robusta/core/sinks/sink_base_params.py b/src/robusta/core/sinks/sink_base_params.py index 75a8a209e..bfbb8286f 100644 --- a/src/robusta/core/sinks/sink_base_params.py +++ b/src/robusta/core/sinks/sink_base_params.py @@ -55,11 +55,42 @@ def check_intervals(cls, intervals: List[ActivityInterval]): return intervals +ScopeIncludeExcludeParams = Dict[str, Optional[Union[str, List[str]]]] + + +class ScopeParams(BaseModel): + include: Optional[List[ScopeIncludeExcludeParams]] + exclude: 
Optional[List[ScopeIncludeExcludeParams]] + + @root_validator + def check_non_empty(cls, data: Dict) -> Dict: + if not (data.get("include") is not None or data.get("exclude") is not None): + raise ValueError("scope requires include and/or exclude subfield") + return data + + @root_validator + def check_and_normalize(cls, data: Dict) -> Dict: + """Check and normalize entries inside include/exclude""" + for key in ["include", "exclude"]: + entry = data[key] + if entry is None: + continue + if entry == []: + raise ValueError("scope include/exclude specification requires at least one matcher") + for inc_exc_params in entry: + for attr_name, regex_or_regexes in inc_exc_params.items(): + if isinstance(regex_or_regexes, str): + regex_or_regexes = [regex_or_regexes] + inc_exc_params[attr_name] = regex_or_regexes + return data + + class SinkBaseParams(BaseModel): name: str send_svg: bool = False default: bool = True match: dict = {} + scope: Optional[ScopeParams] activity: Optional[ActivityParams] stop: bool = False # Stop processing if this sink has been matched diff --git a/tests/scope_test_config.yaml b/tests/scope_test_config.yaml new file mode 100644 index 000000000..235f5e62f --- /dev/null +++ b/tests/scope_test_config.yaml @@ -0,0 +1,383 @@ +# include/exclude match fields: +# title: e.g. Crashing pod foo in namespace default +# name : the Kubernetes object name +# namespace: the Kubernetes object namespace +# kind: one of deployment, node, pod, job, daemonset +# node : the Kubernetes node name +# severity: one of INFO, LOW, MEDIUM, HIGH +# type: one of ISSUE, CONFIGURATION_CHANGE, HEALTH_CHECK, REPORT +# source: one of NONE, KUBERNETES_API_SERVER, PROMETHEUS, MANUAL, CALLBACK +# identifier: e.g. report_crash_loop +# labels: A comma separated list of key=val e.g. foo=bar,instance=123 +# annotations: A comma separated list of key=val e.g. 
app.kubernetes.io/name=prometheus +tests: + - scope: + include: + - title: the title + name: the-name + namespace: the-namespace + kind: deployment + node: the-node + severity: HIGH + type: REPORT + source: CALLBACK + identifier: crash_loop + labels: "label1=lab1, label2=lab2" + annotations: "annot1=ann1, annot2=ann2" + checks: + - message: "accept all fields match" + finding: + title: the title + subject: + name: the-name + namespace: the-namespace + kind: deployment + node: the-node + severity: HIGH + finding_type: REPORT + source: CALLBACK + aggregation_key: crash_loop + labels: + label1: lab1 + label2: lab2 + annotations: + annot1: ann1 + annot2: ann2 + expected: true + - message: "reject missing label" + finding: + title: the title + subject: + name: the-name + namespace: the-namespace + kind: deployment + node: the-node + severity: HIGH + finding_type: REPORT + source: CALLBACK + aggregation_key: crash_loop + labels: + label1: lab1 + annotations: + annot1: ann1 + annot2: ann2 + expected: false + - message: "reject wrong title" + finding: + title: title + subject: + name: the-name + namespace: the-namespace + kind: deployment + node: the-node + severity: HIGH + finding_type: REPORT + source: CALLBACK + aggregation_key: crash_loop + labels: + label1: lab1 + annotations: + annot1: ann1 + annot2: ann2 + expected: false + - scope: + include: + - name: xxx + namespace: yyy + exclude: + - name: zzz + checks: + - message: "accept by name xxx and ns yyy" + finding: + subject: + name: xxx + namespace: yyy + expected: true + - message: "reject with name xx and ns yy" + finding: + subject: + name: xx + namespace: yy + expected: false + - message: "reject with name zzz (exclude)" + finding: + subject: + name: zzz + namespace: yy + expected: false + - scope: + include: + - name: + - name1 + - name2 + namespace: + - ns1 + - ns2 + checks: + - message: "accept name1 ns2" + finding: + subject: + name: name1 + namespace: ns2 + expected: true + - message: "accept name1 ns1" + 
finding: + subject: + name: name1 + namespace: ns1 + expected: true + - message: "reject name1 ns3" + finding: + subject: + name: name1 + namespace: ns3 + expected: false + - message: "reject name3 ns3" + finding: + subject: + name: name3 + namespace: ns3 + expected: false + - scope: + include: + - name: + - .*myname.* + - his-name + title: + - the title + checks: + - message: "accept his-name and title" + finding: + title: the title + subject: + name: his-name + expected: true + - message: "accept xxmynamename1 and title" + finding: + title: the title + subject: + name: xxmynamename1 + expected: true + - message: "reject mynam and title" + finding: + title: the title + subject: + name: mynam + expected: false + - message: "reject mynam and bad title" + finding: + title: bad title + subject: + name: mynam + expected: false + - message: "reject myname and bad title" + finding: + title: bad title + subject: + name: myname + expected: false + - scope: + include: + - name: n.* + labels: "x = y, p=q " + checks: + - message: "some-message" + finding: + title: the title + subject: + name: name + labels: + x: y + p: zzz + expected: false + - scope: + include: + - name: name + labels: " x=y,p = q " + checks: + - message: "new message" + finding: + title: the title + subject: + name: name + labels: + x: y + p: q + expected: true + - scope: + include: + - labels: " x = y.*y , p != q[0-9] " + checks: + - message: "new message" + finding: + labels: + x: y + p: q + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + p: q + expected: true + - message: "new message" + finding: + labels: + x: yAAAy + p: q1 + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + p: q1111 + expected: true + - message: "new message" + finding: + labels: + x: yAAAy123 + p: q1111 + expected: false + - message: "new message" + finding: + labels: + p: q1111 + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + expected: false + - 
scope: + include: + - labels: " x = y.*y , p != q[0-9] " + exclude: + - labels: " x = y.*y , p != q[0-9] " + checks: + - message: "new message" + finding: + labels: + x: y + p: q + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + p: q + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + p: q1 + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + p: q1111 + expected: false + - message: "new message" + finding: + labels: + x: yAAAy123 + p: q1111 + expected: false + - message: "new message" + finding: + labels: + p: q1111 + expected: false + - message: "new message" + finding: + labels: + x: yAAAy + expected: false + - scope: + include: + - name: nam1 + namespace: ns1 + - name: nam2 + namespace: ns2 + exclude: + - name: nam3 + namespace: ns3 + checks: + - message: "nam1 ns1" + finding: + subject: + name: nam1 + namespace: ns1 + expected: true + - message: "nam2 ns2" + finding: + subject: + name: nam2 + namespace: ns2 + expected: true + - message: "nam3 ns3" + finding: + subject: + name: nam3 + namespace: ns3 + expected: false + - message: "nam4 ns4" + finding: + subject: + name: nam4 + namespace: ns4 + expected: false + - scope: + include: + - identifier: iden1 + kind: + - pod + - node + - identifier: iden2 + kind: + - deployment + - job + checks: + - message: "iden1 deployment" + finding: + aggregation_key: iden1 + subject: + kind: deployment + expected: false + - message: "iden2 pod" + finding: + aggregation_key: iden2 + subject: + kind: pod + expected: false + - message: "iden1 node" + finding: + aggregation_key: iden1 + subject: + kind: node + expected: true + - message: "iden1 daemonset" + finding: + aggregation_key: iden1 + subject: + kind: daemonset + expected: false + - message: "iden11 daemonset" + finding: + aggregation_key: iden1 + subject: + kind: daemonset + expected: false + - message: "iden11 node" + finding: + aggregation_key: iden11 + subject: + kind: node + expected: false 
+ - message: "iden2 job" + finding: + aggregation_key: iden2 + subject: + kind: job + expected: true diff --git a/tests/test_sink_scope.py b/tests/test_sink_scope.py new file mode 100644 index 000000000..ab3be0616 --- /dev/null +++ b/tests/test_sink_scope.py @@ -0,0 +1,205 @@ +import unittest.mock +from typing import Dict, List, Optional +from unittest.mock import Mock + +import pytest +import yaml +from pydantic import BaseModel, Extra + +from robusta.core.reporting import Finding, FindingSeverity, FindingSource, FindingSubject +from robusta.core.reporting.consts import FindingSubjectType, FindingType +from robusta.core.sinks.sink_base import SinkBase +from robusta.core.sinks.sink_base_params import ScopeParams, SinkBaseParams + + +class CheckFindingSubject(BaseModel): + name: Optional[str] = "pod-xxx-yyy" + namespace: Optional[str] = "default" + kind: Optional[str] = "pod" + node: Optional[str] = None + + class Config: + extra = Extra.forbid + + +class CheckFinding(BaseModel): + title: str = "test finding" + subject: CheckFindingSubject = CheckFindingSubject() + labels: Dict[str, str] = {} + annotations: Dict[str, str] = {} + aggregation_key: str = "oom_kill" + severity: str = "INFO" + source: str = "NONE" + finding_type: str = "ISSUE" + + class Config: + extra = Extra.forbid + + def create_finding(self) -> Finding: + subject = FindingSubject( + name=self.subject.name, + namespace=self.subject.namespace, + subject_type=FindingSubjectType.from_kind(self.subject.kind), + node=self.subject.node, + labels=self.labels, + annotations=self.annotations, + ) + return Finding( + title=self.title, + subject=subject, + aggregation_key=self.aggregation_key, + severity=FindingSeverity.from_severity(self.severity), + source=FindingSource.from_source(self.source), + finding_type=FindingType.from_type(self.finding_type), + ) + + +class ScopeCheck(BaseModel): + finding: CheckFinding + expected: bool + message: str + + class Config: + extra = Extra.forbid + + +class 
ScopeTest(BaseModel): + scope: ScopeParams + checks: List[ScopeCheck] + + class Config: + extra = Extra.forbid + + +class _TestConfig(BaseModel): + tests: List[ScopeTest] + + +class TestScopeParams: + def test_scope_params_inc_and_exc_missing(self): + with pytest.raises(ValueError): + ScopeParams(include=None, exclude=None) + + @pytest.mark.parametrize( + "include,exclude", + [ + (None, []), + ([], None), + ([], []), + ], + ) + def test_scope_params_empty_inc_exc(self, include, exclude): + with pytest.raises(ValueError): + ScopeParams(include=include, exclude=exclude) + + @pytest.mark.parametrize( + "include_data,exclude_data,expected_include_data,expected_exclude_data", + [ + ( + None, + [{"labels": "xyz", "name": ["1", "2"]}], + None, + [{"labels": ["xyz"], "name": ["1", "2"]}], + ), + ( + [{"name": ".*", "labels": ["1", "2"]}], + None, + [{"name": [".*"], "labels": ["1", "2"]}], + None, + ), + ], + ) + def test_scope_params_normalization(self, include_data, exclude_data, expected_include_data, expected_exclude_data): + params = ScopeParams(include=include_data, exclude=exclude_data) + assert params.include == expected_include_data + assert params.exclude == expected_exclude_data + + +class TestSinkBase: + @pytest.mark.parametrize("matches_result,expected_result", [(True, True), (False, False)]) + def test_accepts(self, matches_result, expected_result): + sink_base = SinkBase(sink_params=SinkBaseParams(name="x"), registry=Mock()) + finding = Finding(title="y", aggregation_key="aaa") + finding.matches = Mock(return_value=matches_result) + # sink_base.time_slices is [TimeSliceAlways()] here, so the result will depend + # solely on matches_result. 
+ assert sink_base.accepts(finding) is expected_result + + +class TestFilterable: + @pytest.fixture() + def get_invalid_attributes(self): + return Mock(return_value=[]) + + @pytest.fixture() + def finding(self, get_invalid_attributes): + finding = Finding(title="title", aggregation_key="ag_key") + with unittest.mock.patch.object(finding, "get_invalid_attributes", get_invalid_attributes): + yield finding + + @pytest.fixture() + def finding_with_data(self, finding): + finding.subject.labels = {"a": "x", "b": "fffy", "X": " hello "} + finding.subject.namespace = "ns12" + finding.title = "c1" + return finding + + def test_matches_no_scope_req(self, finding): + with unittest.mock.patch.object(finding, "scope_inc_exc_matches", Mock()) as mock_scope_inc_exc_matches: + finding.matches({}, None) + mock_scope_inc_exc_matches.assert_not_called() + finding.get_invalid_attributes.assert_called_once() + + @pytest.mark.parametrize( + "include,exclude,expected_output,match_req_evaluated", + [ + ([{"labels": "a=x,b=.*y"}], None, True, False), + ([{"labels": "a=q,b=.*y"}], None, False, True), + ([{"labels": "a=q,b=.*y"}, {"namespace": "ns12"}], None, True, False), + ([{"labels": "a=q,b=.*y"}, {"title": "c1"}], None, True, False), + ([{"labels": "a=q,b=.*y"}, {"title": "d[1-9]*"}], None, False, True), + (None, [{"labels": "a=x,b=.*y"}], False, False), + (None, [{"labels": "a=q,b=.*y"}, {"namespace": "ns12"}], False, False), + (None, [{"labels": "a=q,b=.*y"}, {"title": "c[1-9]"}], False, False), + (None, [{"labels": "a=q,b=.*y"}, {"title": "d[1-9]*"}], True, True), + ([{"namespace": "ns"}], None, False, True), + (None, [{"namespace": "ns"}], True, True), + ([{"labels": " a=x , b=.*y "}], None, True, False), + ([{"labels": "X=hello"}], None, True, False), + ([{"labels": "X=.*el.*"}], None, True, False), + ([{"labels": "X!=aaa"}], None, True, False), + ([{"labels": " X != aaa "}], None, True, False), + ], + ) + def test_matches_inc_match( + self, + finding_with_data, + 
get_invalid_attributes, + include, + exclude, + expected_output, + match_req_evaluated, + ): + assert finding_with_data.matches({}, ScopeParams(include=include, exclude=exclude)) is expected_output + # The asserts below check that the result has/has not been computed using scope params only and + # that match_requirements were not evaluated. It's not the cleanest, but to make it so would + # require major refactorings in Finding/Filterable. + if match_req_evaluated: + get_invalid_attributes.assert_called_once() + else: + get_invalid_attributes.assert_not_called() + + def test_matches_unknown_attr(self, finding_with_data): + with pytest.raises(ValueError): + finding_with_data.matches({}, ScopeParams(include=[{"xyzzfoo": "123"}], exclude=None)) + + def test_sink_scopes(self, finding): + with open("tests/scope_test_config.yaml") as test_config_file: + test_config = _TestConfig(**yaml.safe_load(test_config_file)) + + for scope_test in test_config.tests: + sink_base = SinkBase(sink_params=SinkBaseParams(name="x", scope=scope_test.scope), registry=Mock()) + + for check in scope_test.checks: + finding = check.finding.create_finding() + assert sink_base.accepts(finding) is check.expected, check.message From ca11c282ff92208c1388962891fb5cf94cce1d85 Mon Sep 17 00:00:00 2001 From: Ganesh Rathinavel Medayil <182092+ganeshrvel@users.noreply.github.com> Date: Tue, 27 Feb 2024 13:42:37 +0530 Subject: [PATCH 5/7] Enhanced crash logs events for: report_crash_loop, pod_oom_kill_enricher, image_pull_backoff_reporter, KubePodCrashLooping (#1282) * Enhanced crash logs events for: - report_crash_loop - pod_oom_kill_enricher - image_pull_backoff_reporter - KubePodCrashLooping * Fixed send_crash_report import error * Code cleanup. optimized the enrichments new pods counting logic * Fixed enrichment type Fixed crash pod log details * Better replica available text and better regex * Better description text * Better pod text * Pending pod markdown reverted. 
* Code cleaning * Removed RSA pair * Enhanced crash logs events for: - report_crash_loop - pod_oom_kill_enricher - image_pull_backoff_reporter - KubePodCrashLooping * Fixed send_crash_report import error * Code cleanup. optimized the enrichments new pods counting logic * Fixed enrichment type Fixed crash pod log details * Better replica available text and better regex * Better description text * Better pod text * Pending pod markdown reverted. * Code cleaning * Optimized the empty file handling Optimized Events handling * Optimized the empty file block --- .../robusta_playbooks/event_enrichments.py | 6 +- .../image_pull_backoff_enricher.py | 18 +--- .../pod_investigator_enricher.py | 85 +++++++++++------ .../restart_loop_reporter.py | 32 +------ src/robusta/api/__init__.py | 8 +- src/robusta/core/playbooks/crash_reporter.py | 65 +++++++++++++ .../core/playbooks/oom_killer_utils.py | 28 ++++-- .../playbooks/pod_utils/crashloop_utils.py | 83 ++++++++--------- .../playbooks/pod_utils/imagepull_utils.py | 92 ++++++++++--------- .../playbooks/pod_utils/pending_pod_utils.py | 21 +++-- src/robusta/core/reporting/__init__.py | 2 + src/robusta/core/reporting/base.py | 3 + src/robusta/core/reporting/blocks.py | 29 ++++++ .../sinks/robusta/dal/model_conversion.py | 22 ++++- src/robusta/integrations/msteams/sender.py | 2 +- 15 files changed, 321 insertions(+), 175 deletions(-) create mode 100644 src/robusta/core/playbooks/crash_reporter.py diff --git a/playbooks/robusta_playbooks/event_enrichments.py b/playbooks/robusta_playbooks/event_enrichments.py index 600ca7e44..dae643934 100644 --- a/playbooks/robusta_playbooks/event_enrichments.py +++ b/playbooks/robusta_playbooks/event_enrichments.py @@ -245,8 +245,10 @@ def deployment_events_enricher(event: DeploymentEvent, params: ExtendedEventEnri event.add_enrichment([events_table_block], {SlackAnnotations.ATTACHMENT: True}, enrichment_type=EnrichmentType.k8s_events, title="Deployment Events") else: - pods = 
list_pods_using_selector(dep.metadata.namespace, dep.spec.selector, "status.phase=Running") - event.add_enrichment([MarkdownBlock(f"*Replicas: Desired ({dep.spec.replicas}) --> Running ({len(pods)})*")]) + available_replicas = dep.status.availableReplicas if dep.status.availableReplicas else 0 + event.add_enrichment( + [MarkdownBlock(f"*Replicas: Desired ({dep.spec.replicas}) --> Running ({available_replicas})*")]) + events_table_block = get_resource_events_table( "*Deployment events:*", dep.kind, diff --git a/playbooks/robusta_playbooks/image_pull_backoff_enricher.py b/playbooks/robusta_playbooks/image_pull_backoff_enricher.py index e8ad07c0f..88e4c5e66 100755 --- a/playbooks/robusta_playbooks/image_pull_backoff_enricher.py +++ b/playbooks/robusta_playbooks/image_pull_backoff_enricher.py @@ -5,7 +5,6 @@ from hikaru.model.rel_1_26 import ContainerStatus, PodStatus from robusta.api import ( - BaseBlock, Finding, FindingSeverity, FindingSource, @@ -13,13 +12,11 @@ PodFindingSubject, RateLimitParams, action, - get_image_pull_backoff_blocks, + get_image_pull_backoff_enrichment, ) from robusta.core.playbooks.pod_utils.imagepull_utils import ( get_image_pull_backoff_container_statuses, - get_pod_issue_message_and_reason, ) -from robusta.core.reporting import MarkdownBlock def get_image_pull_backoff_container_statuses(status: PodStatus) -> List[ContainerStatus]: @@ -30,11 +27,6 @@ def get_image_pull_backoff_container_statuses(status: PodStatus) -> List[Contain ] -def decompose_flag(flag: Flag) -> List[Flag]: - members, _ = enum._decompose(flag.__class__, flag._value_) # type: ignore - return members - - @action def image_pull_backoff_reporter(event: PodEvent, action_params: RateLimitParams): """ @@ -55,7 +47,7 @@ def image_pull_backoff_reporter(event: PodEvent, action_params: RateLimitParams) pod_name = pod.metadata.name namespace = pod.metadata.namespace - blocks: List[BaseBlock] = get_image_pull_backoff_blocks(pod) + backoff_enrichment = 
get_image_pull_backoff_enrichment(pod) finding = Finding( title=f"Failed to pull at least one image in pod {pod_name} in namespace {namespace}", @@ -64,8 +56,6 @@ def image_pull_backoff_reporter(event: PodEvent, action_params: RateLimitParams) aggregation_key="image_pull_backoff_reporter", subject=PodFindingSubject(pod), ) - finding.add_enrichment(blocks) - message, reason = get_pod_issue_message_and_reason(pod) - if reason: - finding.add_enrichment([MarkdownBlock(f"{reason}: {message}")]) + finding.add_enrichment(backoff_enrichment.blocks, enrichment_type=backoff_enrichment.enrichment_type, + title=backoff_enrichment.title) event.add_finding(finding) diff --git a/playbooks/robusta_playbooks/pod_investigator_enricher.py b/playbooks/robusta_playbooks/pod_investigator_enricher.py index 7e2721102..79a2eb990 100755 --- a/playbooks/robusta_playbooks/pod_investigator_enricher.py +++ b/playbooks/robusta_playbooks/pod_investigator_enricher.py @@ -12,11 +12,12 @@ MarkdownBlock, action, build_selector_query, - get_crash_report_blocks, - get_image_pull_backoff_blocks, + get_crash_report_enrichments, + get_image_pull_backoff_enrichment, get_job_all_pods, - get_pending_pod_blocks, + get_pending_pod_enrichment, parse_kubernetes_datetime_to_ms, + Enrichment ) from robusta.core.playbooks.pod_utils.imagepull_utils import get_pod_issue_message_and_reason @@ -89,7 +90,7 @@ def is_crashlooping(pod: Pod) -> bool: for container_status in all_statuses if container_status.state.waiting is not None and container_status.restartCount > 1 - and "CrashloopBackOff" in container_status.state.waiting.reason + and "CrashLoopBackOff" in container_status.state.waiting.reason ] return len(crashlooping_containers) > 0 @@ -127,32 +128,64 @@ def has_image_pull_issue(pod: Pod) -> bool: return len(image_pull_statuses) > 0 +def get_pod_issue_explanation(event: KubernetesResourceEvent, issue: PodIssue, message: Optional[str], + reason: Optional[str]) -> str: + resource = event.get_resource() + + if issue == 
PodIssue.ImagePullBackoff: + issue_text = "image-pull-backoff" + elif issue == PodIssue.Pending: + issue_text = "scheduling issue" + elif issue in [PodIssue.CrashloopBackoff, PodIssue.Crashing]: + issue_text = "crash-looping" + else: + issue_text = issue.name + + # Information about number of available pods, and number of unavailable should be taken from the resource status + if resource.kind in ["Deployment", "StatefulSet", "DaemonSet"]: + unavailable_replicas = 0 + available_replicas = 0 + + if resource.kind in ["Deployment", "StatefulSet"]: + unavailable_replicas = resource.status.unavailableReplicas if resource.status.unavailableReplicas else 0 + available_replicas = resource.status.availableReplicas if resource.status.availableReplicas else 0 + elif resource.kind == "DaemonSet": + unavailable_replicas = resource.status.numberUnavailable if resource.status.numberUnavailable else 0 + available_replicas = resource.status.numberAvailable if resource.status.numberAvailable else 0 + + message_text = f"{available_replicas} pod(s) are available. {unavailable_replicas} pod(s) are not ready due to {issue_text}" + else: + message_text = f"Pod is not ready due to {issue_text}" + + if reason: + message_text += f"\n\n{reason}: {message if message else 'N/A'}" + + return message_text + + def report_pod_issue( event: KubernetesResourceEvent, pods: List[Pod], issue: PodIssue, message: Optional[str], reason: Optional[str] ): # find pods with issues pods_with_issue = [pod for pod in pods if detect_pod_issue(pod) == issue] - pod_names = [pod.metadata.name for pod in pods_with_issue] - expected_pods = get_expected_replicas(event) - message_string = f"{len(pod_names)}/{expected_pods} pod(s) are in {issue} state. " - resource = event.get_resource() - if resource.kind == "Job": - message_string = f"{len(pod_names)} pod(s) are in {issue} state. 
" - # no need to report here if len(pods) != expected_pods since there are mismatch enrichers + if len(pods_with_issue) < 1: + logging.debug(f"`pods_with_issue` for found for issue: {issue}") + return + + message_text = get_pod_issue_explanation(event=event, issue=issue, reason=reason, + message=message) - blocks: List[BaseBlock] = [MarkdownBlock(message_string)] # get blocks from specific pod issue - additional_blocks = get_pod_issue_blocks(pods_with_issue[0]) + pod_issues_enrichments = get_pod_issue_enrichments(pods_with_issue[0]) - if additional_blocks: - blocks.append(MarkdownBlock(f"\n\n*{pod_names[0]}* was picked for investigation\n")) - blocks.extend(additional_blocks) - event.add_enrichment(blocks) + if pod_issues_enrichments: + issues_enrichments = pod_issues_enrichments - if reason: - event.extend_description(f"{reason}: {message}") + for enrichment in issues_enrichments: + event.add_enrichment(enrichment.blocks, enrichment_type=enrichment.enrichment_type, title=enrichment.title) + event.extend_description(message_text) def get_expected_replicas(event: KubernetesResourceEvent) -> int: resource = event.get_resource() @@ -170,13 +203,13 @@ def get_expected_replicas(event: KubernetesResourceEvent) -> int: return 1 -def get_pod_issue_blocks(pod: Pod) -> Optional[List[BaseBlock]]: +def get_pod_issue_enrichments(pod: Pod) -> Optional[List[Enrichment]]: if has_image_pull_issue(pod): - return get_image_pull_backoff_blocks(pod) + enrichment = get_image_pull_backoff_enrichment(pod) + return [enrichment] elif is_pod_pending(pod): - return get_pending_pod_blocks(pod) - elif is_crashlooping(pod): - return get_crash_report_blocks(pod) - elif had_recent_crash(pod): - return get_crash_report_blocks(pod) + enrichment = get_pending_pod_enrichment(pod) + return [enrichment] + elif is_crashlooping(pod) or had_recent_crash(pod): + return get_crash_report_enrichments(pod) return None diff --git a/playbooks/robusta_playbooks/restart_loop_reporter.py 
b/playbooks/robusta_playbooks/restart_loop_reporter.py index 1e6bd9154..f0a34ef94 100644 --- a/playbooks/robusta_playbooks/restart_loop_reporter.py +++ b/playbooks/robusta_playbooks/restart_loop_reporter.py @@ -4,42 +4,16 @@ from hikaru.model.rel_1_26 import ContainerStatus, PodStatus from robusta.api import ( ActionParams, - BaseBlock, - Finding, - FindingSeverity, - FindingSource, NamedRegexPattern, PodEvent, - PodFindingSubject, RateLimiter, RateLimitParams, RegexReplacementStyle, action, - get_crash_report_blocks, + send_crash_report ) -def _send_crash_report( - event: PodEvent, - action_name: str, - regex_replacer_patterns: Optional[NamedRegexPattern] = None, - regex_replacement_style: Optional[RegexReplacementStyle] = None, -): - - pod = event.get_pod() - finding = Finding( - title=f"Crashing pod {pod.metadata.name} in namespace {pod.metadata.namespace}", - source=FindingSource.KUBERNETES_API_SERVER, - severity=FindingSeverity.HIGH, - aggregation_key=action_name, - subject=PodFindingSubject(pod), - ) - blocks: List[BaseBlock] = get_crash_report_blocks(pod, regex_replacer_patterns, regex_replacement_style) - - finding.add_enrichment(blocks) - event.add_finding(finding) - - class ReportCrashLoopParams(ActionParams): """ :var regex_replacer_patterns: regex patterns to replace text, for example for security reasons (Note: Replacements are executed in the given order) @@ -56,7 +30,7 @@ def report_crash_loop(event: PodEvent, params: ReportCrashLoopParams): regex_replacement_style = ( RegexReplacementStyle[params.regex_replacement_style] if params.regex_replacement_style else None ) - _send_crash_report(event, "report_crash_loop", params.regex_replacer_patterns, regex_replacement_style) + send_crash_report(event, "report_crash_loop", params.regex_replacer_patterns, regex_replacement_style) # The code below is deprecated. 
Please use the new crash loop action @@ -105,4 +79,4 @@ def restart_loop_reporter(event: PodEvent, config: RestartLoopParams): if not RateLimiter.mark_and_test("restart_loop_reporter", pod_name + pod.metadata.namespace, config.rate_limit): return - _send_crash_report(event, "restart_loop_reporter") + send_crash_report(event, "restart_loop_reporter") diff --git a/src/robusta/api/__init__.py b/src/robusta/api/__init__.py index 1ce40a73e..5aa763cc5 100644 --- a/src/robusta/api/__init__.py +++ b/src/robusta/api/__init__.py @@ -109,12 +109,13 @@ from robusta.core.playbooks.container_playbook_utils import create_container_graph from robusta.core.playbooks.job_utils import CONTROLLER_UID, get_job_all_pods, get_job_latest_pod, get_job_selector from robusta.core.playbooks.node_playbook_utils import create_node_graph_enrichment -from robusta.core.playbooks.pod_utils.crashloop_utils import get_crash_report_blocks +from robusta.core.playbooks.pod_utils.crashloop_utils import get_crash_report_enrichments from robusta.core.playbooks.pod_utils.imagepull_utils import ( - get_image_pull_backoff_blocks, + get_image_pull_backoff_enrichment, get_image_pull_backoff_container_statuses, ) -from robusta.core.playbooks.pod_utils.pending_pod_utils import get_pending_pod_blocks +from robusta.core.playbooks.pod_utils.pending_pod_utils import get_pending_pod_enrichment +from robusta.core.playbooks.crash_reporter import send_crash_report from robusta.core.playbooks.prometheus_enrichment_utils import ( XAxisLine, create_chart_from_prometheus_query, @@ -139,6 +140,7 @@ Emojis, Enrichment, FileBlock, + EmptyFileBlock, Filterable, Finding, FindingSeverity, diff --git a/src/robusta/core/playbooks/crash_reporter.py b/src/robusta/core/playbooks/crash_reporter.py new file mode 100644 index 000000000..3339183c5 --- /dev/null +++ b/src/robusta/core/playbooks/crash_reporter.py @@ -0,0 +1,65 @@ +import logging +from typing import Optional + +from robusta.core.model.base_params import NamedRegexPattern 
+from robusta.core.playbooks.pod_utils.crashloop_utils import get_crash_report_enrichments +from robusta.core.reporting import Finding, FindingSource, FindingSeverity, FileBlock, EmptyFileBlock +from robusta.core.reporting.base import EnrichmentType +from robusta.core.reporting.finding_subjects import PodFindingSubject +from robusta.integrations.kubernetes.autogenerated.events import PodEvent +from robusta.integrations.kubernetes.custom_models import RegexReplacementStyle + + +def send_crash_report( + event: PodEvent, + action_name: str, + regex_replacer_patterns: Optional[NamedRegexPattern] = None, + regex_replacement_style: Optional[RegexReplacementStyle] = None, +): + pod = event.get_pod() + + all_statuses = pod.status.containerStatuses + pod.status.initContainerStatuses + crashed_container_statuses = [ + container_status + for container_status in all_statuses + if container_status.state.waiting is not None and container_status.restartCount >= 1 + ] + + finding = Finding( + title=f"Crashing pod {pod.metadata.name} in namespace {pod.metadata.namespace}", + source=FindingSource.KUBERNETES_API_SERVER, + severity=FindingSeverity.HIGH, + aggregation_key=action_name, + subject=PodFindingSubject(pod), + ) + + enrichments = get_crash_report_enrichments(pod) + for enrichment in enrichments: + finding.add_enrichment(enrichment.blocks, + enrichment_type=enrichment.enrichment_type, + title=enrichment.title) + + for container_status in crashed_container_statuses: + try: + container_log = pod.get_logs( + container_status.name, + previous=True, + regex_replacer_patterns=regex_replacer_patterns, + regex_replacement_style=regex_replacement_style, + ) + + if not container_log: + log_block = EmptyFileBlock(filename=f"{pod.metadata.name}.log", + remarks=f"Logs unavailable for container: {container_status.name}") + logging.info( + f"could not fetch logs from container: {container_status.name}" + ) + else: + log_block = FileBlock(filename=f"{pod.metadata.name}.log", 
contents=container_log.encode()) + + finding.add_enrichment([log_block], + enrichment_type=EnrichmentType.text_file, title="Logs") + except Exception: + logging.error("Failed to get pod logs", exc_info=True) + + event.add_finding(finding) diff --git a/src/robusta/core/playbooks/oom_killer_utils.py b/src/robusta/core/playbooks/oom_killer_utils.py index 9bf6e4af7..4ce25d752 100644 --- a/src/robusta/core/playbooks/oom_killer_utils.py +++ b/src/robusta/core/playbooks/oom_killer_utils.py @@ -3,6 +3,7 @@ from robusta.api import ( ExecutionBaseEvent, + EmptyFileBlock, FileBlock, LogEnricherParams, MarkdownBlock, @@ -10,6 +11,7 @@ RegexReplacementStyle, RobustaPod, ) +from robusta.core.playbooks.pod_utils.crashloop_utils import get_crash_report_enrichments from robusta.core.reporting.base import EnrichmentType @@ -40,11 +42,19 @@ def start_log_enrichment( RegexReplacementStyle[params.regex_replacement_style] if params.regex_replacement_style else None ) + enrichments = get_crash_report_enrichments(pod) + for enrichment in enrichments: + event.add_enrichment(enrichment.blocks, + enrichment_type=enrichment.enrichment_type, + title=enrichment.title) + if not container and pod.spec.containers: # TODO do we want to keep this part of code? It used to sometimes report logs for a wrong # container when a container inside a pod was oomkilled. I can imagine it could cause # similar problems in other cases. 
container = pod.spec.containers[0].name + + log_data = "" for _ in range(tries - 1): log_data = pod.get_logs( container=container, @@ -57,15 +67,19 @@ def start_log_enrichment( logging.info("log data is empty, retrying...") time.sleep(backoff_seconds) continue + break - log_name = pod.metadata.name - log_name += f"/{container}" - event.add_enrichment( - [FileBlock(filename=f"{pod.metadata.name}.log", contents=log_data.encode())], - enrichment_type=EnrichmentType.text_file, - title="Pod Logs" + if not log_data: + log_block = EmptyFileBlock(filename=f"{pod.metadata.name}.log", + remarks=f"Logs unavailable for container: {container}") + logging.info( + f"could not fetch logs from container: {container}" ) - break + else: + log_block = FileBlock(filename=f"{pod.metadata.name}.log", contents=log_data.encode()) + + event.add_enrichment([log_block], + enrichment_type=EnrichmentType.text_file, title="Logs") def logs_enricher(event: PodEvent, params: LogEnricherParams): diff --git a/src/robusta/core/playbooks/pod_utils/crashloop_utils.py b/src/robusta/core/playbooks/pod_utils/crashloop_utils.py index b2965c8ac..b32e58c6c 100644 --- a/src/robusta/core/playbooks/pod_utils/crashloop_utils.py +++ b/src/robusta/core/playbooks/pod_utils/crashloop_utils.py @@ -1,57 +1,58 @@ -import logging from typing import List, Optional from hikaru.model.rel_1_26 import Pod -from robusta.core.model.base_params import NamedRegexPattern -from robusta.core.reporting import BaseBlock, FileBlock, MarkdownBlock -from robusta.integrations.kubernetes.custom_models import RegexReplacementStyle +from robusta.core.reporting import TableBlock +from robusta.core.reporting.base import Enrichment, EnrichmentType -def get_crash_report_blocks( +def get_crash_report_enrichments( pod: Pod, - regex_replacer_patterns: Optional[NamedRegexPattern] = None, - regex_replacement_style: Optional[RegexReplacementStyle] = None, -) -> List[BaseBlock]: +) -> List[Enrichment]: all_statuses = pod.status.containerStatuses + 
pod.status.initContainerStatuses crashed_container_statuses = [ container_status for container_status in all_statuses if container_status.state.waiting is not None and container_status.restartCount >= 1 ] - blocks: List[BaseBlock] = [] + + pod_issues_enrichments: List[Enrichment] = [] + for container_status in crashed_container_statuses: - blocks.append(MarkdownBlock(f"*{container_status.name}* restart count: {container_status.restartCount}")) - if container_status.state and container_status.state.waiting: - blocks.append( - MarkdownBlock(f"*{container_status.name}* waiting reason: {container_status.state.waiting.reason}") - ) + crash_info_rows: List[List[str]] = [] + prev_container_rows: List[List[str]] = [] + + crash_info_rows.append(["Container", container_status.name]) + crash_info_rows.append(["Restarts", container_status.restartCount]) + if container_status.state and container_status.state.terminated: - blocks.append( - MarkdownBlock( - f"*{container_status.name}* termination reason: {container_status.state.terminated.reason}" - ) - ) + if container_status.state.terminated.startedAt: + crash_info_rows.append(["Crashing since", container_status.state.terminated.startedAt]) + crash_info_rows.append(["TERMINATED", f"Reason: {container_status.state.terminated.reason}"]) + + if container_status.state and container_status.state.waiting: + crash_info_rows.append(["WAITING", f"Reason: {container_status.state.waiting.reason}"]) + if container_status.lastState and container_status.lastState.terminated: - blocks.append( - MarkdownBlock( - f"*{container_status.name}* termination reason: {container_status.lastState.terminated.reason}" - ) - ) - try: - container_log = pod.get_logs( - container_status.name, - previous=True, - regex_replacer_patterns=regex_replacer_patterns, - regex_replacement_style=regex_replacement_style, - ) - if container_log: - blocks.append(FileBlock(f"{pod.metadata.name}.txt", container_log)) - else: - blocks.append(MarkdownBlock(f"Container logs 
unavailable for container: {container_status.name}")) - logging.error( - f"could not fetch logs from container: {container_status.name}. logs were {container_log}" - ) - except Exception: - logging.error("Failed to get pod logs", exc_info=True) - return blocks + prev_container_rows.append(["TERMINATED", f"Reason: {container_status.lastState.terminated.reason}"]) + if container_status.lastState.terminated.startedAt: + prev_container_rows.append(["Started at", container_status.lastState.terminated.startedAt]) + if container_status.lastState.terminated.finishedAt: + prev_container_rows.append(["Finished at", container_status.lastState.terminated.finishedAt]) + + crash_info_table_block = TableBlock( + [[k, v] for (k, v) in crash_info_rows], + ["label", "value"], + table_name="*Crash Info*", + ) + prev_container_table_block = TableBlock( + [[k, v] for (k, v) in prev_container_rows], + ["label", "value"], + table_name="*Previous Container*", + ) + + pod_issues_enrichments.append(Enrichment(enrichment_type=EnrichmentType.crash_info, + title="Container Crash information", + blocks=[crash_info_table_block, prev_container_table_block])) + + return pod_issues_enrichments diff --git a/src/robusta/core/playbooks/pod_utils/imagepull_utils.py b/src/robusta/core/playbooks/pod_utils/imagepull_utils.py index 98264ca5d..6e5c4db78 100644 --- a/src/robusta/core/playbooks/pod_utils/imagepull_utils.py +++ b/src/robusta/core/playbooks/pod_utils/imagepull_utils.py @@ -7,15 +7,15 @@ from hikaru.model.rel_1_26 import ContainerStatus, Event, EventList, Pod, PodStatus -from robusta.core.reporting import BaseBlock, HeaderBlock, MarkdownBlock +from robusta.core.reporting import BaseBlock, MarkdownBlock, TableBlock +from robusta.core.reporting.base import EnrichmentType, Enrichment class ImagePullBackoffReason(Flag): Unknown = 0 RepoDoesntExist = 1 NotAuthorized = 2 - ImageDoesntExist = 4 - TagNotFound = 8 + Timeout = 16 def get_image_pull_backoff_container_statuses(status: PodStatus) -> 
List[ContainerStatus]: @@ -43,26 +43,19 @@ def get_pod_issue_message_and_reason(pod: Pod) -> Tuple[Optional[str], Optional[ return None, None -def decompose_flag(flag: Flag) -> List[Flag]: - members, _ = enum._decompose(flag.__class__, flag._value_) - return members - - -def get_image_pull_backoff_blocks(pod: Pod) -> Optional[List[BaseBlock]]: - blocks: List[BaseBlock] = [] +def get_image_pull_backoff_enrichment(pod: Pod) -> Enrichment: + error_blocks: List[BaseBlock] = [] + image_pull_table_blocks: List[BaseBlock] = [] pod_name = pod.metadata.name namespace = pod.metadata.namespace image_pull_backoff_container_statuses = get_image_pull_backoff_container_statuses(pod.status) investigator = ImagePullBackoffInvestigator(pod_name, namespace) + for container_status in image_pull_backoff_container_statuses: - investigation = investigator.investigate(container_status) + image_issue_rows: List[List[str]] = \ + [["Container", container_status.name], ["Image", container_status.image]] - blocks.extend( - [ - HeaderBlock(f"ImagePullBackOff in container {container_status.name}"), - MarkdownBlock(f"*Image:* {container_status.image}"), - ] - ) + investigation = investigator.investigate(container_status) # TODO: this happens when there is a backoff but the original events containing the actual error message are already gone # and all that remains is a backoff event without a detailed error message - maybe we should identify that case and @@ -85,26 +78,39 @@ def get_image_pull_backoff_blocks(pod: Pod) -> Optional[List[BaseBlock]]: reason = investigation.reason error_message = investigation.error_message - if reason != ImagePullBackoffReason.Unknown: - reasons = decompose_flag(reason) - - if len(reasons) == 1: - blocks.extend( - [ - MarkdownBlock(f"*Reason:* {reason}"), - ] - ) - else: - line_separated_reasons = "\n".join([f"{r}" for r in reasons]) - blocks.extend( - [ - MarkdownBlock(f"*Possible reasons:*\n{line_separated_reasons}"), - ] - ) + backoff_reason = 
__imagepull_backoff_reason_to_fix(reason=reason)
+
+        if backoff_reason:
+            reason_text, fix = backoff_reason
+            image_issue_rows.append(["Reason", reason_text])
+            image_issue_rows.append(["Fix", fix])
+
         else:
-            blocks.append(MarkdownBlock(f"*Error message:* {container_status.name}:\n{error_message}"))
-    return blocks
+            error_blocks.append(MarkdownBlock(f"*Error message:* {container_status.name}:\n{error_message}"))
+
+        image_pull_table_blocks.append(TableBlock(
+            [[k, v] for (k, v) in image_issue_rows],
+            ["label", "value"],
+        ))
+
+    image_pull_table_blocks.extend(error_blocks)
+
+    return Enrichment(
+        enrichment_type=EnrichmentType.image_pull_backoff_info,
+        blocks=image_pull_table_blocks,
+        title="Container Image-Pull-Backoff Information")
+
+
+def __imagepull_backoff_reason_to_fix(reason: ImagePullBackoffReason) -> Optional[Tuple[str, str]]:
+    if reason == ImagePullBackoffReason.RepoDoesntExist:
+        return "Image not found", "Make sure the image repository, image name and image tag are correct."
+    if reason == ImagePullBackoffReason.NotAuthorized:
+        return "Unauthorized", 'The repo is access protected. Make sure to configure the correct image pull secrets'
+    if reason == ImagePullBackoffReason.Timeout:
+        return "Timeout", 'If this is not resolved after a few minutes, make sure the image repository is responding.'
+ + return None class ImagePullOffInvestigation: @@ -121,9 +127,7 @@ class ImagePullBackoffInvestigator: # Containerd { "err_template": 'failed to pull and unpack image ".*?": failed to resolve reference ".*?": .*?: not found', - "reason": ImagePullBackoffReason.RepoDoesntExist - | ImagePullBackoffReason.ImageDoesntExist - | ImagePullBackoffReason.TagNotFound, + "reason": ImagePullBackoffReason.RepoDoesntExist, }, { "err_template": ( @@ -131,7 +135,7 @@ class ImagePullBackoffInvestigator: "pull access denied, repository does not exist or may require authorization: server message: " "insufficient_scope: authorization failed" ), - "reason": ImagePullBackoffReason.NotAuthorized | ImagePullBackoffReason.ImageDoesntExist, + "reason": ImagePullBackoffReason.NotAuthorized, }, { "err_template": ( @@ -146,11 +150,11 @@ class ImagePullBackoffInvestigator: "Error response from daemon: pull access denied for .*?, " "repository does not exist or may require 'docker login': denied: requested access to the resource is denied" ), - "reason": ImagePullBackoffReason.NotAuthorized | ImagePullBackoffReason.ImageDoesntExist, + "reason": ImagePullBackoffReason.NotAuthorized, }, { "err_template": "Error response from daemon: manifest for .*? not found: manifest unknown: manifest unknown", - "reason": ImagePullBackoffReason.TagNotFound, + "reason": ImagePullBackoffReason.RepoDoesntExist, }, { "err_template": ( @@ -161,8 +165,12 @@ class ImagePullBackoffInvestigator: }, { "err_template": 'Error response from daemon: manifest for .*? 
not found: manifest unknown: Failed to fetch ".*?"', - "reason": ImagePullBackoffReason.ImageDoesntExist | ImagePullBackoffReason.TagNotFound, + "reason": ImagePullBackoffReason.RepoDoesntExist, }, + { + "err_template": r'.*Timeout exceeded.*', + "reason": ImagePullBackoffReason.Timeout, # Using the Timeout reason + } ] def __init__(self, pod_name: str, namespace: str): diff --git a/src/robusta/core/playbooks/pod_utils/pending_pod_utils.py b/src/robusta/core/playbooks/pod_utils/pending_pod_utils.py index a883828e6..4f81ac653 100644 --- a/src/robusta/core/playbooks/pod_utils/pending_pod_utils.py +++ b/src/robusta/core/playbooks/pod_utils/pending_pod_utils.py @@ -7,17 +7,23 @@ from robusta.core.model.pods import pod_other_requests, pod_requests from robusta.core.playbooks.common import get_event_timestamp -from robusta.core.reporting.blocks import BaseBlock, MarkdownBlock +from robusta.core.reporting import Enrichment +from robusta.core.reporting.base import EnrichmentType +from robusta.core.reporting.blocks import BaseBlock, MarkdownBlock, TableBlock -def get_pending_pod_blocks(pod: Pod): - blocks: List[BaseBlock] = [] +def get_pending_pod_enrichment(pod: Pod) -> Enrichment: investigator = PendingInvestigator(pod) all_reasons = investigator.investigate() message = get_unscheduled_message(pod) - blocks.append(MarkdownBlock(f"Pod {pod.metadata.name} could not be scheduled.")) + pending_rows = [["Pod", pod.metadata.name]] if message: - blocks.append(MarkdownBlock(f"*Reason:* {message}")) + pending_rows.append(["Reason", message]) + + blocks = [TableBlock( + [[k, v] for (k, v) in pending_rows], + ["label", "value"], + )] if all_reasons: RESOURCE_REASONS = [ @@ -39,7 +45,10 @@ def get_pending_pod_blocks(pod: Pod): resources_string = ", ".join(request_resources) blocks.append(MarkdownBlock(f"*Pod requires:* {resources_string}")) - return blocks + return Enrichment( + enrichment_type=EnrichmentType.pending_pod_info, + blocks=blocks, + title="Unscheduled Pod Information") 
def get_unscheduled_message(pod: Pod) -> Optional[str]: diff --git a/src/robusta/core/reporting/__init__.py b/src/robusta/core/reporting/__init__.py index 0f1fdd795..7b68d20d2 100644 --- a/src/robusta/core/reporting/__init__.py +++ b/src/robusta/core/reporting/__init__.py @@ -19,6 +19,7 @@ EventsBlock, EventsRef, FileBlock, + EmptyFileBlock, HeaderBlock, JsonBlock, KubernetesDiffBlock, @@ -60,4 +61,5 @@ "EventsBlock", "EventsRef", "EventsRow", + "EmptyFileBlock", ] diff --git a/src/robusta/core/reporting/base.py b/src/robusta/core/reporting/base.py index ec376caef..48ce393b0 100644 --- a/src/robusta/core/reporting/base.py +++ b/src/robusta/core/reporting/base.py @@ -98,6 +98,9 @@ class EnrichmentType(Enum): alert_labels = "alert_labels" diff = "diff" text_file = "text_file" + crash_info = "crash_info" + image_pull_backoff_info = "image_pull_backoff_info" + pending_pod_info = "pending_pod_info" class Enrichment: diff --git a/src/robusta/core/reporting/blocks.py b/src/robusta/core/reporting/blocks.py index a4848132b..7dfa77130 100644 --- a/src/robusta/core/reporting/blocks.py +++ b/src/robusta/core/reporting/blocks.py @@ -130,6 +130,35 @@ def truncate_content(self, max_file_size_bytes: int) -> bytes: return "\n".join(truncated_lines).encode("utf-8") +class EmptyFileBlock(BaseBlock): + """ + Handle empty log files + """ + + metadata: dict = {} + filename: str + + def __init__( + self, + filename: str, + remarks: str, + metadata: Optional[dict] = None, + **kwargs, + ): + """ + :param filename: the file's name + :param contents: the file's contents + """ + super().__init__( + filename=filename, + metadata=metadata or {}, + **kwargs, + ) + + self.metadata["is_empty"] = True + self.metadata["remarks"] = remarks + + class HeaderBlock(BaseBlock): """ Text formatted as a header diff --git a/src/robusta/core/sinks/robusta/dal/model_conversion.py b/src/robusta/core/sinks/robusta/dal/model_conversion.py index 2309d82fa..dbc12ca6b 100644 --- 
a/src/robusta/core/sinks/robusta/dal/model_conversion.py +++ b/src/robusta/core/sinks/robusta/dal/model_conversion.py @@ -21,7 +21,7 @@ PrometheusBlock, TableBlock, ) -from robusta.core.reporting.blocks import GraphBlock +from robusta.core.reporting.blocks import GraphBlock, EmptyFileBlock from robusta.core.reporting.callbacks import ExternalActionRequestBuilder from robusta.core.sinks.transformer import Transformer from robusta.utils.parsing import datetime_to_db_str @@ -63,14 +63,26 @@ def to_finding_json(account_id: str, cluster_id: str, finding: Finding): return finding_json + @staticmethod + def get_file_type(filename: str): + last_dot_idx = filename.rindex(".") + return filename[last_dot_idx + 1:] + @staticmethod def get_file_object(block: FileBlock): - last_dot_idx = block.filename.rindex(".") return { - "type": block.filename[last_dot_idx + 1 :], + "type": ModelConversion.get_file_type(block.filename), "data": str(base64.b64encode(block.contents)), } + @staticmethod + def get_empty_file_object(block: EmptyFileBlock): + return { + "type": ModelConversion.get_file_type(block.filename), + "metadata": block.metadata, + "data": "", + } + @staticmethod def to_evidence_json( account_id: str, @@ -102,6 +114,8 @@ def to_evidence_json( if block.is_text_file(): block.zip() structured_data.append(ModelConversion.get_file_object(block)) + elif isinstance(block, EmptyFileBlock): + structured_data.append(ModelConversion.get_empty_file_object(block)) elif isinstance(block, FileBlock): if block.is_text_file(): block.zip() @@ -170,7 +184,7 @@ def to_evidence_json( elif isinstance(block, EventsRef): structured_data.append({"type": "events_ref", "data": block.dict()}) else: - logging.error(f"cannot convert block of type {type(block)} to robusta platform format block: {block}") + logging.warning(f"cannot convert block of type {type(block)} to robusta platform format block: {block}") continue # no reason to crash the entire report if not structured_data: diff --git 
a/src/robusta/integrations/msteams/sender.py b/src/robusta/integrations/msteams/sender.py index 1a0ba6dde..96c74f557 100644 --- a/src/robusta/integrations/msteams/sender.py +++ b/src/robusta/integrations/msteams/sender.py @@ -34,7 +34,7 @@ def __to_ms_teams(cls, block: BaseBlock, msg: MsTeamsMsg): elif isinstance(block, CallbackBlock): logging.error("CallbackBlock not supported for msteams") else: - logging.error(f"cannot convert block of type {type(block)} to msteams format block: {block}") + logging.warning(f"cannot convert block of type {type(block)} to msteams format block: {block}") @classmethod def __split_block_to_files_and_all_the_rest(cls, enrichment: Enrichment): From c9a0a0af31d57ecec984c46100528ee9f8f8efb1 Mon Sep 17 00:00:00 2001 From: Pavan Gudiwada <25551553+pavangudiwada@users.noreply.github.com> Date: Tue, 27 Feb 2024 18:04:38 +0530 Subject: [PATCH 6/7] Removed RSA references (#1312) Co-authored-by: Pavan Gudiwada --- docs/setup-robusta/configuration-secrets.rst | 3 --- docs/setup-robusta/gitops/argocd.rst | 3 --- docs/setup-robusta/gitops/flux.rst | 3 --- 3 files changed, 9 deletions(-) diff --git a/docs/setup-robusta/configuration-secrets.rst b/docs/setup-robusta/configuration-secrets.rst index 480f7989d..31cecb0cc 100644 --- a/docs/setup-robusta/configuration-secrets.rst +++ b/docs/setup-robusta/configuration-secrets.rst @@ -16,7 +16,6 @@ Robusta can pull values from Kubernetes secrets for: * Sink Configuration * Global Config * Action Parameters -* Robusta RSA keys To do so, first define an environment variable based on a Kubernetes secret. Add to Robusta's Helm values: @@ -41,5 +40,3 @@ Then reference that environment variable in other Helm values using the special Finally, make sure the Kubernetes secret actually exists. In this example, create a Secret named ``my-robusta-secrets`` with a ``secret_grafana_key`` value inside. - -For Robusta RSA keys, you can define the ``existingSecret`` parameter. 
The secret must have the `pub` and `prv` keys. diff --git a/docs/setup-robusta/gitops/argocd.rst b/docs/setup-robusta/gitops/argocd.rst index 7b8635954..e3585a86e 100644 --- a/docs/setup-robusta/gitops/argocd.rst +++ b/docs/setup-robusta/gitops/argocd.rst @@ -39,9 +39,6 @@ Example ``generated_values.yaml``: enablePlatformPlaybooks: true runner: sendAdditionalTelemetry: true - rsa: - prv: xxxxxx - pub: xxxxxx .. Options .. ^^^^^^^^^^^^^ diff --git a/docs/setup-robusta/gitops/flux.rst b/docs/setup-robusta/gitops/flux.rst index d9cf721f5..e42c3cb74 100644 --- a/docs/setup-robusta/gitops/flux.rst +++ b/docs/setup-robusta/gitops/flux.rst @@ -40,9 +40,6 @@ Example ``generated_values.yaml``: enablePlatformPlaybooks: true runner: sendAdditionalTelemetry: true - rsa: - prv: xxxxxx - pub: xxxxxx .. admonition:: Secrets handling :class: note From 68d19dea823bd3ffd1cfb2f9e1ff218af42d09b6 Mon Sep 17 00:00:00 2001 From: Robert Szefler Date: Wed, 28 Feb 2024 23:43:05 +0100 Subject: [PATCH 7/7] Improved SSH host key verification (#1313) * Improved SSH host key verification * improve CUSTOM_SSH_HOST_KEYS docs * call setup_host_keys only if an actual Git repo is requested --- .../external-playbook-repositories.rst | 49 +++++++++++++++++-- src/robusta/core/model/env_vars.py | 1 + src/robusta/integrations/git/git_repo.py | 40 ++++++++++++--- .../integrations/git/well_known_hosts.py | 16 ++++++ 4 files changed, 95 insertions(+), 11 deletions(-) create mode 100644 src/robusta/integrations/git/well_known_hosts.py diff --git a/docs/playbook-reference/defining-playbooks/external-playbook-repositories.rst b/docs/playbook-reference/defining-playbooks/external-playbook-repositories.rst index 843eefe11..933497da9 100644 --- a/docs/playbook-reference/defining-playbooks/external-playbook-repositories.rst +++ b/docs/playbook-reference/defining-playbooks/external-playbook-repositories.rst @@ -14,15 +14,58 @@ use in :ref:`customPlaybooks`. 
External actions are loaded using the ``playbookRepos`` Helm value, with either HTTPs or SSH. -If your repository is not in ``github.com`` or ``bitbucket.org`` (default verified domains), please add your repository domains: +If you are going to be using an external repository via HTTPS, you just need to configure +correct read access credentials (see below). When connecting via SSH, however, there is an +additional requirement to verify the remote host's identity on the client side, as SSH +generally does not provide any method of doing that automatically (in contrast with HTTPS, +which relies on the well established cryptographic infrastructure of certificates). + +In order to streamline the process of SSH host key verification, Robusta ships with verified +host keys for the following popular Git providers: + +* github.com +* gitlab.com +* bitbucket.org +* ssh.dev.azure.com + +If you are using a Git service outside of that list, you should add its SSH host keys in Robusta +configuration. This is done via the `CUSTOM_SSH_HOST_KEYS` environment variable with the list +of keys separated by newlines: .. 
code-block:: yaml runner: additional_env_vars: - - name: GIT_REPOS_VERIFIED_HOSTS - value: "ssh.dev.azure.com gitlab.com" + - name: CUSTOM_SSH_HOST_KEYS + # codeberg.org host keys + - value: | + |1|TVOSCWl9+tXzKniecqFzaidE+yA=|XgOrtH2kjzERBPrbC9aGbaisnDE= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC8hZi7K1/2E2uBX8gwPRJAHvRAob+3Sn+y2hxiEhN0buv1igjYFTgFO2qQD8vLfU/HT/P/rqvEeTvaDfY1y/vcvQ8+YuUYyTwE2UaVU5aJv89y6PEZBYycaJCPdGIfZlLMmjilh/Sk8IWSEK6dQr+g686lu5cSWrFW60ixWpHpEVB26eRWin3lKYWSQGMwwKv4LwmW3ouqqs4Z4vsqRFqXJ/eCi3yhpT+nOjljXvZKiYTpYajqUC48IHAxTWugrKe1vXWOPxVXXMQEPsaIRc2hpK+v1LmfB7GnEGvF1UAKnEZbUuiD9PBEeD5a1MZQIzcoPWCrTxipEpuXQ5Tni4mN + |1|Zht5NJQx7c6F9fzemGK15ewk4lE=|D8ZMvKG5X9HEAUqWZaGJOwpBb7s= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBL2pDxWr18SoiDJCGZ5LmxPygTlPu+cCKSkpqkvCyQzl5xmIMeKNdfdBpfbCGDPoZQghePzFZkKJNR/v9Win3Sc= + |1|KMrl/f5rYsb8KkF7rHCwBuo49Do=|wkmvtUU1nQTyj+ZNyVqZlO0oP5o= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIVIC02vnjFyL+I4RHfvIGNtOgJMe769VTF1VR4EB3ZB + +Another option to automate host key verification is the `GIT_REPOS_VERIFIED_HOSTS` environment +variable. + +.. warning:: + + **DANGER ZONE** + Using the `GIT_REPOS_VERIFIED_HOSTS` variable is generally not recommended due to + security issues. Each host added this way will be automatically trusted *without* + an actual host key verification, potentially allowing man-in-the-middle attacks with + catastrophic implications. For more information, see + `here `_. + + Please make sure you know what you are doing before using this functionality. + +An example of using that configuration option: + +.. 
code-block:: yaml + + runner: + additional_env_vars: + - name: GIT_REPOS_VERIFIED_HOSTS + value: "ssh.yourhost.com ssh.anotherhost.com" Loading Actions from Public Git Repo ------------------------------------------ diff --git a/src/robusta/core/model/env_vars.py b/src/robusta/core/model/env_vars.py index fb859279f..797fb9628 100644 --- a/src/robusta/core/model/env_vars.py +++ b/src/robusta/core/model/env_vars.py @@ -17,6 +17,7 @@ def load_bool(env_var, default: bool): DEFAULT_PLAYBOOKS_PIP_INSTALL = load_bool("DEFAULT_PLAYBOOKS_PIP_INSTALL", False) CUSTOM_PLAYBOOKS_ROOT = os.path.join(PLAYBOOKS_ROOT, "storage") +CUSTOM_SSH_HOST_KEYS = os.environ.get("CUSTOM_SSH_HOST_KEYS", "") PLAYBOOKS_CONFIG_FILE_PATH = os.environ.get("PLAYBOOKS_CONFIG_FILE_PATH") diff --git a/src/robusta/integrations/git/git_repo.py b/src/robusta/integrations/git/git_repo.py index 8657af30c..200f6213b 100644 --- a/src/robusta/integrations/git/git_repo.py +++ b/src/robusta/integrations/git/git_repo.py @@ -9,7 +9,8 @@ from collections import defaultdict, namedtuple from typing import Dict, List -from robusta.core.model.env_vars import GIT_MAX_RETRIES +from robusta.core.model.env_vars import CUSTOM_SSH_HOST_KEYS, GIT_MAX_RETRIES +from robusta.integrations.git.well_known_hosts import WELL_KNOWN_HOST_KEYS GIT_DIR_NAME = "robusta-git" REPO_LOCAL_BASE_DIR = os.path.abspath(os.path.join(os.environ.get("REPO_LOCAL_BASE_DIR", "/app"), GIT_DIR_NAME)) @@ -25,6 +26,26 @@ class GitRepoManager: manager_lock = threading.Lock() repo_map = defaultdict(None) + host_keys_initialized = False + + @classmethod + def setup_host_keys(cls, custom_host_keys: List[str]): + if cls.host_keys_initialized: + return + + if not os.path.exists(SSH_ROOT_DIR): + os.mkdir(SSH_ROOT_DIR) + with open(f"{SSH_ROOT_DIR}/known_hosts", "w") as f: + for key in WELL_KNOWN_HOST_KEYS + custom_host_keys: + key = key.strip() + if key: + logging.debug(f"Adding a key to known_hosts: {key}") + f.write(key) + + if GIT_REPOS_VERIFIED_HOSTS: + 
os.system(f"ssh-keyscan -H {GIT_REPOS_VERIFIED_HOSTS} >> {SSH_ROOT_DIR}/known_hosts") + + cls.host_keys_initialized = True @staticmethod def get_git_repo(git_repo_url: str, git_key: str): @@ -41,10 +62,11 @@ def remove_git_repo(git_repo_url): with GitRepoManager.manager_lock: del GitRepoManager.repo_map[git_repo_url] - @staticmethod - def clear_git_repos(): + @classmethod + def clear_git_repos(cls): with GitRepoManager.manager_lock: GitRepoManager.repo_map.clear() + cls.host_keys_initialized = False SingleChange = namedtuple("SingleChange", "commit_date commit_message") @@ -60,10 +82,12 @@ def __init__(self, git_repo_url: str, git_key: str, git_branch: str = None): self.git_repo_url = git_repo_url self.env = os.environ.copy() self.git_branch = git_branch - ssh_key_option = "" + if git_key: # Add ssh key for non-public repositories key_file_name = self.init_key(git_key) ssh_key_option = f"-i {key_file_name}" + else: + ssh_key_option = "" self.env["GIT_SSH_COMMAND"] = f"ssh {ssh_key_option} -o IdentitiesOnly=yes" self.repo_lock = threading.RLock() @@ -82,11 +106,11 @@ def init_key(self, git_key): git_key = git_key + "\n" with open(key_file_name, "w") as key_file: + os.chmod(key_file_name, 0o400) key_file.write(textwrap.dedent(f"{git_key}")) - os.chmod(key_file_name, 0o400) - if not os.path.exists(SSH_ROOT_DIR): - os.mkdir(SSH_ROOT_DIR) - os.system(f"ssh-keyscan -H github.com bitbucket.org {GIT_REPOS_VERIFIED_HOSTS} >> {SSH_ROOT_DIR}/known_hosts") + + GitRepoManager.setup_host_keys(CUSTOM_SSH_HOST_KEYS.split("\n")) + return key_file_name @staticmethod diff --git a/src/robusta/integrations/git/well_known_hosts.py b/src/robusta/integrations/git/well_known_hosts.py new file mode 100644 index 000000000..8367346d1 --- /dev/null +++ b/src/robusta/integrations/git/well_known_hosts.py @@ -0,0 +1,16 @@ +WELL_KNOWN_HOST_KEYS = [ + # github.com + "|1|8/btn2nIvP08st0rffVzfBw/u7o=|7jWoObKOrX9Q68fkdIfHnvw4/xc= ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=",
+    "|1|5pBsB/qQfvYG5u33Fe5wQw383XI=|8gDX+h6v0hpMrQrQIlKGVDjHQ98= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=",
+    "|1|a3CVdE3rzmO3nD3HMdYtX+ZS8yE=|QoMmPJiUV67E51Qf4jyceMAGHzs= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl",
+    # gitlab.com
+    "|1|LiZMgMrzgvu/B5SOn95fLbtM0DA=|XvW22X6+G9QDtY61+6w5ns5xEbo= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCsj2bNKTBSpIYDEGk9KxsGh3mySTRgMtXL583qmBpzeQ+jqCMRgBqB98u3z++J1sKlXHWfM9dyhSevkMwSbhoR8XIq/U0tCNyokEi/ueaBMCvbcTHhO7FcwzY92WK4Yt0aGROY5qX2UKSeOvuP4D6TPqKF1onrSzH9bx9XUf2lEdWT/ia1NEKjunUqu1xOB/StKDHMoX4/OKyIzuS0q/T1zOATthvasJFoPrAjkohTyaDUz2LN5JoH839hViyEG82yB+MjcFV5MU3N1l1QL3cVUCh93xSaua1N85qivl+siMkPGbO5xR/En4iEY6K2XPASUEMaieWVNTRCtJ4S8H+9",
+    "|1|YbnG/64qNFA2Cf9qt5BD2Aj4rO0=|kf1ipE8aTntrFxyHLAxZ/BQy7iU= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFSMqzJeV9rUzU4kWitGjeR4PWSa29SPqJ1fVkhtj3Hw9xjLVXVYrU9QlYWrOLXBpQ6KWjbjTDTdDkoohFzgbEY=",
+    "|1|g/BQ/0dlrf4oABFby7FWFPLvpiM=|WWrhkEzfupIH172XUsOlove4MPc= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAfuCHKVTjquxvt6CM6tdG4SLp1Btn/nOeHHE5UOzRdf",
+    # bitbucket.org
+    "|1|XoBV2bENlv6QLsqfAmAOW68cM9A=|kNOmO8JCtG47o4n967vPlWRxzJI= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDQeJzhupRu0u0cdegZIa8e86EG2qOCsIsD1Xw0xSeiPDlCr7kq97NLmMbpKTX6Esc30NuoqEEHCuc7yWtwp8dI76EEEB1VqY9QJq6vk+aySyboD5QF61I/1WeTwu+deCbgKMGbUijeXhtfbxSxm6JwGrXrhBdofTsbKRUsrN1WoNgUa8uqN1Vx6WAJw1JHPhglEGGHea6QICwJOAr/6mrui/oB7pkaWKHj3z7d1IC4KWLtY47elvjbaTlkN04Kc/5LFEirorGYVbt15kAUlqGM65pk6ZBxtaO3+30LVlORZkxOh+LKL/BvbZ/iRNhItLqNyieoQj/uh/7Iv4uyH/cV/0b4WDSd3DptigWq84lJubb9t/DnZlrJazxyDCulTmKdOR7vs9gMTo+uoIrPSb8ScTtvw65+odKAlBj59dhnVp9zd7QUojOpXlL62Aw56U4oO+FALuevvMjiWeavKhJqlR7i5n9srYcrNV7ttmDw7kf/97P5zauIhxcjX+xHv4M=",
+    "|1|aAanWvP59AXJG63mKQRaW3JrUGc=|pBIqK8MnUUisy4Cq5g+I+uIR6pw= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBPIQmuzMBuKdWeF4+a2sjSSpBK0iqitSQ+5BM9KhpexuGt20JpTVM7u5BDZngncgrqDMbWdxMWWOGtZ9UgbqgZE=",
+    "|1|8FbblX02OSTs6bOHkKv4ZsLBvjE=|9H2xv53vfCSHX7OQdAc1DnQzzjo= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIazEu89wgQZ4bqs3d63QSMzYVa0MuJ2e2gKTKqu+UUO",
+    # ssh.dev.azure.com
+    "|1|HlUhUTA+Je1obG7hPFVLGOhfWD0=|A+mrcbaWzLwVwxBiZaqrcHjTGhE= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC7Hr1oTWqNqOlzGJOfGJ4NakVyIzf1rXYd4d7wo6jBlkLvCA4odBlL0mDUyZ0/QUfTTqeu+tm22gOsv+VrVTMk6vwRU75gY/y9ut5Mb3bR5BV58dKXyq9A9UeB5Cakehn5Zgm6x1mKoVyf+FFn26iYqXJRgzIZZcZ5V6hrE0Qg39kZm4az48o0AUbf6Sp4SLdvnuMa2sVNwHBboS7EJkm57XQPVU3/QpyNLHbWDdzwtrlS+ez30S3AdYhLKEOxAG8weOnyrtLJAUen9mTkol8oII1edf7mWWbWVf0nBmly21+nZcmCTISQBtdcyPaEno7fFQMDD26/s0lfKob4Kw8H",
+]