filter predictor_data and add row limit warning
yusufuyanik1 committed Jan 7, 2025
1 parent d8eed96 commit 1dc4ad6
Showing 5 changed files with 54 additions and 49 deletions.
9 changes: 6 additions & 3 deletions python/pdstools/adm/ADMDatamart.py
@@ -127,14 +127,17 @@ def __init__(
self.model_data = self._validate_model_data(
model_df, query=query, extract_pyname_keys=extract_pyname_keys
)
predictor_df = self._validate_predictor_data(predictor_df)

# TODO shouldnt we subset the predictor data to the model IDs also in the model data - if that is present
self.predictor_data = self._validate_predictor_data(predictor_df)
if predictor_df is not None and self.model_data is not None:
model_ids = self.model_data.select("ModelID").unique()
self.predictor_data = predictor_df.join(model_ids, on="ModelID", how="semi")
else:
self.predictor_data = None

self.combined_data = self.aggregates._combine_data(
self.model_data, self.predictor_data
)

self.bin_aggregator = BinAggregator(dm=self) # attach after model_data

@classmethod
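For context, a minimal standalone sketch of the semi-join filtering introduced in __init__ above, using toy eager frames (hypothetical data, not part of the commit; the real attributes may be lazy frames, but the join call is the same):

import polars as pl

# Toy stand-ins for model_data and predictor_data.
model_data = pl.DataFrame({"ModelID": ["A", "B"], "Performance": [0.62, 0.55]})
predictor_data = pl.DataFrame(
    {"ModelID": ["A", "A", "C"], "PredictorName": ["Age", "Income", "Age"]}
)

# A semi join keeps only predictor rows whose ModelID also occurs in model_data;
# here the row for ModelID "C" is dropped.
model_ids = model_data.select("ModelID").unique()
filtered = predictor_data.join(model_ids, on="ModelID", how="semi")
print(filtered)  # two rows remain, both for ModelID "A"
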
43 changes: 11 additions & 32 deletions python/pdstools/adm/Aggregates.py
@@ -645,50 +645,29 @@ def predictors_overview(self) -> Optional[pl.DataFrame]:
"""
try:
model_identifiers = ["Configuration"] + self.datamart.context_keys

predictor_summary = (
self.last(table="predictor_data")
.filter(pl.col("PredictorName") != "Classifier") # TODO not name, there is a type
.join(
self.last(table="model_data")
.select(["ModelID"] + model_identifiers)
.unique(),
on="ModelID",
how="left",
)
self.last(table="combined_data")
.filter(pl.col("EntryType") != "Classifier")
.group_by(model_identifiers + ["ModelID", "PredictorName"])
.agg(
pl.first("Type"),
pl.first("Performance"),
pl.count("BinIndex").alias("Bins"),
pl.first("EntryType"),
pl.count("BinIndex").alias("Bin Count"),
pl.first("Positives"),
pl.col("BinResponseCount")
.filter(pl.col("BinType") == "MISSING")
.sum()
.alias("Missing"),
pl.col("BinResponseCount")
.filter(pl.col("BinType") == "RESIDUAL")
.sum()
.alias("Residual"),
pl.first("Positives"),
.alias("Missing Bin Responses"),
pl.first("ResponseCount"),
)
.group_by(model_identifiers + ["PredictorName"])
.agg(
pl.first("Type"),
cdh_utils.weighted_average_polars("Performance", "ResponseCount"),
cdh_utils.weighted_average_polars("Bins", "ResponseCount"),
((pl.sum("Missing") / pl.sum("ResponseCount")) * 100).alias(
"Missing %"
),
((pl.sum("Residual") / pl.sum("ResponseCount")) * 100).alias(
"Residual %"
),
pl.sum("Positives"),
pl.sum("ResponseCount").alias("Responses"),
)
.fill_null(0)
.fill_nan(0)
.with_columns(pl.col("Bins").cast(pl.Int16))
.with_columns(
pl.col("Bin Count").cast(pl.Int16),
pl.col("Positives").cast(pl.Int64),
pl.col("ResponseCount").cast(pl.Int64),
)
)

return predictor_summary
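For reference, a minimal sketch of the response-weighted rollup that predictors_overview performs per predictor; cdh_utils.weighted_average_polars is assumed to be equivalent to the explicit expression below (toy data, not part of the commit):

import polars as pl

# Hypothetical per-model rows for a single predictor, already reduced to one row
# per ModelID/PredictorName as in the first group_by above.
rows = pl.DataFrame(
    {
        "PredictorName": ["Age", "Age"],
        "ModelID": ["A", "B"],
        "Performance": [0.60, 0.70],
        "ResponseCount": [200, 600],
    }
)

# Response-weighted mean performance per predictor:
# (0.60 * 200 + 0.70 * 600) / 800 = 0.675, weighted toward the busier model.
overview = rows.group_by("PredictorName").agg(
    (
        (pl.col("Performance") * pl.col("ResponseCount")).sum()
        / pl.col("ResponseCount").sum()
    ).alias("Performance"),
    pl.col("ResponseCount").sum().alias("Responses"),
)
print(overview)
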
32 changes: 24 additions & 8 deletions python/pdstools/adm/Reports.py
@@ -421,8 +421,7 @@ def excel_report(
self,
name: Union[Path, str] = Path("Tables.xlsx"),
predictor_binning: bool = False,
query: Optional[QUERY] = None,
) -> Optional[Path]:
) -> tuple[Optional[Path], list[str]]:
"""
Export aggregated data to an Excel file.
This method exports the last snapshots of model_data, predictor summary,
@@ -441,12 +440,16 @@
Returns
-------
Union[Path, None]
The path to the created Excel file if the export was successful,
None if no data was available to export.
tuple[Union[Path, None], list[str]]
A tuple containing:
- The path to the created Excel file if the export was successful, None if no data was available
- A list of warning messages (empty if no warnings)
"""
from xlsxwriter import Workbook

EXCEL_ROW_LIMIT = 1048576
warning_messages = []

name = Path(name)
tabs = {
"modeldata_last_snapshot": self.datamart.aggregates.last(table="model_data")
@@ -465,7 +468,7 @@

if not tabs: # pragma: no cover
print("No data available to export.")
return None
return None, warning_messages

with Workbook(
name, options={"nan_inf_to_errors": True, "remove_timezone": True}
@@ -476,7 +479,20 @@
.list.eval(pl.element().cast(pl.Utf8))
.list.join(", ")
)
data.collect().write_excel(workbook=wb, worksheet=tab)
data = data.collect()

if data.shape[0] > EXCEL_ROW_LIMIT:
warning_msg = (
f"The data for sheet '{tab}' exceeds Excel's row limit "
f"({data.shape[0]:,} rows > {EXCEL_ROW_LIMIT:,} rows). "
"This sheet will not be written to the Excel file. "
"Please filter your data before generating the Excel report."
)
warning_messages.append(warning_msg)
print(warning_msg)
continue
else:
data.write_excel(workbook=wb, worksheet=tab)

print(f"Data exported to {name}")
return name
return name, warning_messages
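A short usage sketch of the changed excel_report signature; the from_ds_export call and the export directory are assumptions for illustration, and only the (path, warnings) tuple handling reflects this commit:

from pdstools import ADMDatamart

# Assumed constructor and placeholder path; build the datamart from your own exports.
dm = ADMDatamart.from_ds_export(base_path="path/to/datamart/exports")

path, warnings = dm.generate.excel_report("Tables.xlsx", predictor_binning=True)
if path is None:
    print("No data available to export.")
for msg in warnings:
    # Sheets exceeding Excel's 1,048,576-row limit are skipped and reported here.
    print(msg)
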
13 changes: 10 additions & 3 deletions python/pdstools/app/health_check/pages/3_Reports.py
@@ -8,6 +8,7 @@
from pdstools.utils.streamlit_utils import model_selection_df
from pdstools.utils.show_versions import show_versions
from pdstools.utils.cdh_utils import _apply_query
from pdstools import ADMDatamart

if "dm" not in st.session_state:
st.warning("Please configure your files in the `data import` tab.")
@@ -82,16 +83,22 @@
st.warning("Please upload Predictor Snapshot to include binning!")
if st.button("Create Tables"):
with st.spinner("Creating Tables..."):
filtered_datamart = ADMDatamart(
st.session_state["dm"].model_data,
st.session_state["dm"].predictor_data,
query=st.session_state.get("filters", None),
)
tablename = "ADMSnapshots.xlsx"
tables = st.session_state["dm"].generate.excel_report(
tables, warning_messages = filtered_datamart.generate.excel_report(
tablename,
predictor_binning=include_binning,
query=(st.session_state.get("filters", None)),
)
st.session_state["run"][st.session_state["runID"]]["tables"] = tablename
st.session_state["run"][st.session_state["runID"]]["tablefile"] = open(
tables, "rb"
)
for message in warning_messages:
st.warning(message)

btn = st.download_button(
label="Download additional tables",
@@ -146,7 +153,7 @@
value=True,
)
st.session_state["selected_models"] = edited_df.loc[
edited_df["Generate Report"] == True
edited_df["Generate Report"]
]["ModelID"].to_list()
st.write(f"{len(st.session_state['selected_models'])} models are selected")
if len(st.session_state["selected_models"]) > 0:
6 changes: 3 additions & 3 deletions python/tests/test_healthcheck.py
@@ -32,7 +32,7 @@ def test_GenerateHealthCheck(sample: ADMDatamart):


def test_ExportTables(sample: ADMDatamart):
excel = sample.generate.excel_report(predictor_binning=True)
excel, warning_messages = sample.generate.excel_report(predictor_binning=True)
assert excel == pathlib.Path("./Tables.xlsx")
assert excel.exists()
spreadsheet = ExcelFile(excel)
@@ -48,7 +48,7 @@ def test_ExportTables(sample: ADMDatamart):


def test_ExportTables_NoBinning(sample: ADMDatamart):
excel = sample.generate.excel_report(predictor_binning=False)
excel, warning_messages = sample.generate.excel_report(predictor_binning=False)
assert excel == pathlib.Path("./Tables.xlsx")
assert pathlib.Path(excel).exists()
spreadsheet = ExcelFile(excel)
@@ -73,7 +73,7 @@ def test_GenerateHealthCheck_ModelDataOnly(


def test_ExportTables_ModelDataOnly(sample_without_predictor_binning: ADMDatamart):
excel = sample_without_predictor_binning.generate.excel_report(
excel, warning_messages = sample_without_predictor_binning.generate.excel_report(
name="ModelTables.xlsx", predictor_binning=True
)
assert excel == pathlib.Path("ModelTables.xlsx")
