Skip to content

Commit

Permalink
Rebase changes from Quantco#543
Browse files Browse the repository at this point in the history
  • Loading branch information
zmbc committed Oct 29, 2024
1 parent 190b266 commit df10408
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 10 deletions.
12 changes: 12 additions & 0 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,7 @@ def __init__(
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
use_sparse_hessian: bool = True,
):
self.l1_ratio = l1_ratio
self.P1 = P1
Expand Down Expand Up @@ -795,6 +796,7 @@ def __init__(
self.categorical_format = categorical_format
self.cat_missing_method = cat_missing_method
self.cat_missing_name = cat_missing_name
self.use_sparse_hessian = use_sparse_hessian

@property
def family_instance(self) -> ExponentialDispersionModel:
Expand Down Expand Up @@ -1112,13 +1114,15 @@ def _solve(
lower_bounds=lower_bounds,
upper_bounds=upper_bounds,
verbose=self.verbose > 0,
use_sparse_hessian=self.use_sparse_hessian,
)
if self._solver == "irls-ls":
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_least_squares_solver, coef, irls_data
)
# 4.2 coordinate descent ##############################################
elif self._solver == "irls-cd":
# This is the case we're concerned with for wide problems.
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_cd_solver, coef, irls_data
)
Expand Down Expand Up @@ -2880,6 +2884,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame.
use_sparse_hessian : boolean, optional, (default=True)
If ``True``, stores the current state of the Hessian as a sparse COO matrix.
If ``False``, stores it as a dense matrix of size `num_cols` x `num_cols`, where
`num_cols` is the number of columns in the data X. Use ``True`` to avoid memory
issues when working with extremely wide data.
Attributes
----------
coef_ : numpy.array, shape (n_features,)
Expand Down Expand Up @@ -2965,6 +2975,7 @@ def __init__(
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
use_sparse_hessian: bool = True,
):
self.alphas = alphas
self.alpha = alpha
Expand Down Expand Up @@ -3005,6 +3016,7 @@ def __init__(
categorical_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
use_sparse_hessian=use_sparse_hessian,
)

def _validate_hyperparameters(self) -> None:
Expand Down
68 changes: 58 additions & 10 deletions src/glum/_solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def update_hessian(state, data, active_set):
# 2) use all the rows
# 3) Include the P2 components
# 4) just like an update, we only update the active_set

hessian_init = build_hessian_delta(
data.X,
state.hessian_rows,
Expand All @@ -149,16 +150,33 @@ def update_hessian(state, data, active_set):
np.arange(data.X.shape[0], dtype=np.int32),
active_set,
)
state.hessian[np.ix_(active_set, active_set)] = hessian_init

# In the sparse Hessian case, state.hessian is a coo_matrix.
# hessian_init is a NumPy array of size active_set.shape[0] x active_set.shape[0];
# we can convert it to a coo_matrix and simply add it to state.hessian.
if data.use_sparse_hessian:
coo_data = hessian_init.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian = sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] = hessian_init

state.hessian_initialized = True
n_active_rows = data.X.shape[0]

else:
# In an update iteration, we want to:
# 1) use the difference in hessian_rows from the last iteration
# 2) filter for active_rows in case data.hessian_approx != 0
# 3) Ignore the P2 components because those don't change and have
# already been added
# 4) only update the active set subset of the hessian.

hessian_rows_diff, active_rows = identify_active_rows(
state.gradient_rows,
state.hessian_rows,
Expand All @@ -173,13 +191,32 @@ def update_hessian(state, data, active_set):
active_rows=active_rows,
active_cols=active_set,
)
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

if data.use_sparse_hessian:
coo_data = hessian_delta.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian += sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

n_active_rows = active_rows.shape[0]

return (
state.hessian[np.ix_(active_set, active_set)],
n_active_rows,
)
# If state.hessian is in COO form, we have to convert to CSC in order to index the
# active set elements, which we then convert to a numpy array for compatibility with
# the rest of the code. This numpy array is dense, but we care about problems in
# which the active set is usually much smaller than the number of data columns.
if data.use_sparse_hessian:
return (
state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(),
n_active_rows,
)
else:
return state.hessian[np.ix_(active_set, active_set)], n_active_rows


def _is_subset(x, y):
Expand Down Expand Up @@ -268,7 +305,7 @@ def _irls_solver(inner_solver, coef, data) -> tuple[np.ndarray, int, int, list[l
----------
inner_solver
A least squares solver that can handle the appropriate penalties. With
an L1 penalty, this will _cd_solver. With only an L2 penalty,
an L1 penalty, this will be _cd_solver. With only an L2 penalty,
_least_squares_solver will be more efficient.
coef : ndarray, shape (c,)
If fit_intercept=False, shape c=X.shape[1].
Expand Down Expand Up @@ -432,6 +469,7 @@ def __init__(
lower_bounds: Optional[np.ndarray] = None,
upper_bounds: Optional[np.ndarray] = None,
verbose: bool = False,
use_sparse_hessian: bool = True,
):
self.X = X
self.y = y
Expand Down Expand Up @@ -463,6 +501,7 @@ def __init__(

self.intercept_offset = 1 if self.fit_intercept else 0
self.verbose = verbose
self.use_sparse_hessian = use_sparse_hessian

self._check_data()

Expand Down Expand Up @@ -525,9 +564,18 @@ def __init__(self, coef, data):
self.old_hessian_rows = np.zeros(data.X.shape[0], dtype=data.X.dtype)
self.gradient_rows = None
self.hessian_rows = None
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

# In order to avoid memory issues for very wide datasets (e.g. Issue #485), we
# can instantiate the Hessian as a sparse COO matrix.
if data.use_sparse_hessian:
self.hessian = sparse.coo_matrix(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)
else:
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

self.hessian_initialized = False
self.coef_P2 = None
self.norm_min_subgrad = None
Expand Down

0 comments on commit df10408

Please sign in to comment.