filter predictor_data and add row limit warning
yusufuyanik1 committed Jan 7, 2025
1 parent d8eed96 commit 1dc4ad6
Showing 5 changed files with 54 additions and 49 deletions.
9 changes: 6 additions & 3 deletions python/pdstools/adm/ADMDatamart.py
@@ -127,14 +127,17 @@ def __init__(
self.model_data = self._validate_model_data(
model_df, query=query, extract_pyname_keys=extract_pyname_keys
)
predictor_df = self._validate_predictor_data(predictor_df)

# TODO shouldnt we subset the predictor data to the model IDs also in the model data - if that is present
self.predictor_data = self._validate_predictor_data(predictor_df)
if predictor_df is not None and self.model_data is not None:
model_ids = self.model_data.select("ModelID").unique()
self.predictor_data = predictor_df.join(model_ids, on="ModelID", how="semi")
else:
self.predictor_data = None

self.combined_data = self.aggregates._combine_data(
self.model_data, self.predictor_data
)

self.bin_aggregator = BinAggregator(dm=self) # attach after model_data

@classmethod
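For context, a minimal standalone sketch of the semi-join filtering introduced in __init__ above, using toy eager frames (hypothetical data, not part of the commit; the real attributes may be lazy frames, but the join call is the same):

import polars as pl

# Toy stand-ins for model_data and predictor_data.
model_data = pl.DataFrame({"ModelID": ["A", "B"], "Performance": [0.62, 0.55]})
predictor_data = pl.DataFrame(
    {"ModelID": ["A", "A", "C"], "PredictorName": ["Age", "Income", "Age"]}
)

# A semi join keeps only predictor rows whose ModelID also occurs in model_data;
# here the row for ModelID "C" is dropped.
model_ids = model_data.select("ModelID").unique()
filtered = predictor_data.join(model_ids, on="ModelID", how="semi")
print(filtered)  # two rows remain, both for ModelID "A"
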
43 changes: 11 additions & 32 deletions python/pdstools/adm/Aggregates.py
@@ -645,50 +645,29 @@ def predictors_overview(self) -> Optional[pl.DataFrame]:
"""
try:
model_identifiers = ["Configuration"] + self.datamart.context_keys

predictor_summary = (
self.last(table="predictor_data")
.filter(pl.col("PredictorName") != "Classifier") # TODO not name, there is a type
.join(
self.last(table="model_data")
.select(["ModelID"] + model_identifiers)
.unique(),
on="ModelID",
how="left",
)
self.last(table="combined_data")
.filter(pl.col("EntryType") != "Classifier")
.group_by(model_identifiers + ["ModelID", "PredictorName"])
.agg(
pl.first("Type"),
pl.first("Performance"),
pl.count("BinIndex").alias("Bins"),
pl.first("EntryType"),
pl.count("BinIndex").alias("Bin Count"),
pl.first("Positives"),
pl.col("BinResponseCount")
.filter(pl.col("BinType") == "MISSING")
.sum()
.alias("Missing"),
pl.col("BinResponseCount")
.filter(pl.col("BinType") == "RESIDUAL")
.sum()
.alias("Residual"),
pl.first("Positives"),
.alias("Missing Bin Responses"),
pl.first("ResponseCount"),
)
.group_by(model_identifiers + ["PredictorName"])
.agg(
pl.first("Type"),
cdh_utils.weighted_average_polars("Performance", "ResponseCount"),
cdh_utils.weighted_average_polars("Bins", "ResponseCount"),
((pl.sum("Missing") / pl.sum("ResponseCount")) * 100).alias(
"Missing %"
),
((pl.sum("Residual") / pl.sum("ResponseCount")) * 100).alias(
"Residual %"
),
pl.sum("Positives"),
pl.sum("ResponseCount").alias("Responses"),
)
.fill_null(0)
.fill_nan(0)
.with_columns(pl.col("Bins").cast(pl.Int16))
.with_columns(
pl.col("Bin Count").cast(pl.Int16),
pl.col("Positives").cast(pl.Int64),
pl.col("ResponseCount").cast(pl.Int64),
)
)

return predictor_summary
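For reference, a minimal sketch of the response-weighted rollup that predictors_overview performs per predictor; cdh_utils.weighted_average_polars is assumed to be equivalent to the explicit expression below (toy data, not part of the commit):

import polars as pl

# Hypothetical per-model rows for a single predictor, already reduced to one row
# per ModelID/PredictorName as in the first group_by above.
rows = pl.DataFrame(
    {
        "PredictorName": ["Age", "Age"],
        "ModelID": ["A", "B"],
        "Performance": [0.60, 0.70],
        "ResponseCount": [200, 600],
    }
)

# Response-weighted mean performance per predictor:
# (0.60 * 200 + 0.70 * 600) / 800 = 0.675, weighted toward the busier model.
overview = rows.group_by("PredictorName").agg(
    (
        (pl.col("Performance") * pl.col("ResponseCount")).sum()
        / pl.col("ResponseCount").sum()
    ).alias("Performance"),
    pl.col("ResponseCount").sum().alias("Responses"),
)
print(overview)
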
32 changes: 24 additions & 8 deletions python/pdstools/adm/Reports.py
@@ -421,8 +421,7 @@ def excel_report(
self,
name: Union[Path, str] = Path("Tables.xlsx"),
predictor_binning: bool = False,
query: Optional[QUERY] = None,
) -> Optional[Path]:
) -> tuple[Optional[Path], list[str]]:
"""
Export aggregated data to an Excel file.
This method exports the last snapshots of model_data, predictor summary,
@@ -441,12 +440,16 @@
Returns
-------
Union[Path, None]
The path to the created Excel file if the export was successful,
None if no data was available to export.
tuple[Union[Path, None], list[str]]
A tuple containing:
- The path to the created Excel file if the export was successful, None if no data was available
- A list of warning messages (empty if no warnings)
"""
from xlsxwriter import Workbook

EXCEL_ROW_LIMIT = 1048576
warning_messages = []

name = Path(name)
tabs = {
"modeldata_last_snapshot": self.datamart.aggregates.last(table="model_data")
@@ -465,7 +468,7 @@

if not tabs: # pragma: no cover
print("No data available to export.")
return None
return None, warning_messages

with Workbook(
name, options={"nan_inf_to_errors": True, "remove_timezone": True}
@@ -476,7 +479,20 @@
.list.eval(pl.element().cast(pl.Utf8))
.list.join(", ")
)
data.collect().write_excel(workbook=wb, worksheet=tab)
data = data.collect()

if data.shape[0] > EXCEL_ROW_LIMIT:
warning_msg = (
f"The data for sheet '{tab}' exceeds Excel's row limit "
f"({data.shape[0]:,} rows > {EXCEL_ROW_LIMIT:,} rows). "
"This sheet will not be written to the Excel file. "
"Please filter your data before generating the Excel report."
)
warning_messages.append(warning_msg)
print(warning_msg)
continue
else:
data.write_excel(workbook=wb, worksheet=tab)

print(f"Data exported to {name}")
return name
return name, warning_messages
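A short usage sketch of the changed excel_report signature; the from_ds_export call and the export directory are assumptions for illustration, and only the (path, warnings) tuple handling reflects this commit:

from pdstools import ADMDatamart

# Assumed constructor and placeholder path; build the datamart from your own exports.
dm = ADMDatamart.from_ds_export(base_path="path/to/datamart/exports")

path, warnings = dm.generate.excel_report("Tables.xlsx", predictor_binning=True)
if path is None:
    print("No data available to export.")
for msg in warnings:
    # Sheets exceeding Excel's 1,048,576-row limit are skipped and reported here.
    print(msg)
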
13 changes: 10 additions & 3 deletions python/pdstools/app/health_check/pages/3_Reports.py
@@ -8,6 +8,7 @@
from pdstools.utils.streamlit_utils import model_selection_df
from pdstools.utils.show_versions import show_versions
from pdstools.utils.cdh_utils import _apply_query
from pdstools import ADMDatamart

if "dm" not in st.session_state:
st.warning("Please configure your files in the `data import` tab.")
@@ -82,16 +83,22 @@
st.warning("Please upload Predictor Snapshot to include binning!")
if st.button("Create Tables"):
with st.spinner("Creating Tables..."):
filtered_datamart = ADMDatamart(
st.session_state["dm"].model_data,
st.session_state["dm"].predictor_data,
query=st.session_state.get("filters", None),
)
tablename = "ADMSnapshots.xlsx"
tables = st.session_state["dm"].generate.excel_report(
tables, warning_messages = filtered_datamart.generate.excel_report(
tablename,
predictor_binning=include_binning,
query=(st.session_state.get("filters", None)),
)
st.session_state["run"][st.session_state["runID"]]["tables"] = tablename
st.session_state["run"][st.session_state["runID"]]["tablefile"] = open(
tables, "rb"
)
for message in warning_messages:
st.warning(message)

btn = st.download_button(
label="Download additional tables",
@@ -146,7 +153,7 @@
value=True,
)
st.session_state["selected_models"] = edited_df.loc[
edited_df["Generate Report"] == True
edited_df["Generate Report"]
]["ModelID"].to_list()
st.write(f"{len(st.session_state['selected_models'])} models are selected")
if len(st.session_state["selected_models"]) > 0:
6 changes: 3 additions & 3 deletions python/tests/test_healthcheck.py
@@ -32,7 +32,7 @@ def test_GenerateHealthCheck(sample: ADMDatamart):


def test_ExportTables(sample: ADMDatamart):
excel = sample.generate.excel_report(predictor_binning=True)
excel, warning_messages = sample.generate.excel_report(predictor_binning=True)
assert excel == pathlib.Path("./Tables.xlsx")
assert excel.exists()
spreadsheet = ExcelFile(excel)
@@ -48,7 +48,7 @@ def test_ExportTables(sample: ADMDatamart):


def test_ExportTables_NoBinning(sample: ADMDatamart):
excel = sample.generate.excel_report(predictor_binning=False)
excel, warning_messages = sample.generate.excel_report(predictor_binning=False)
assert excel == pathlib.Path("./Tables.xlsx")
assert pathlib.Path(excel).exists()
spreadsheet = ExcelFile(excel)
@@ -73,7 +73,7 @@ def test_GenerateHealthCheck_ModelDataOnly(


def test_ExportTables_ModelDataOnly(sample_without_predictor_binning: ADMDatamart):
excel = sample_without_predictor_binning.generate.excel_report(
excel, warning_messages = sample_without_predictor_binning.generate.excel_report(
name="ModelTables.xlsx", predictor_binning=True
)
assert excel == pathlib.Path("ModelTables.xlsx")
