Skip to content

Commit

Permalink
Rebase changes from Quantco#543
Browse files Browse the repository at this point in the history
  • Loading branch information
zmbc committed Oct 29, 2024
1 parent 190b266 commit df10408
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 10 deletions.
12 changes: 12 additions & 0 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,7 @@ def __init__(
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
use_sparse_hessian: bool = True,
):
self.l1_ratio = l1_ratio
self.P1 = P1
Expand Down Expand Up @@ -795,6 +796,7 @@ def __init__(
self.categorical_format = categorical_format
self.cat_missing_method = cat_missing_method
self.cat_missing_name = cat_missing_name
self.use_sparse_hessian = use_sparse_hessian

@property
def family_instance(self) -> ExponentialDispersionModel:
Expand Down Expand Up @@ -1112,13 +1114,15 @@ def _solve(
lower_bounds=lower_bounds,
upper_bounds=upper_bounds,
verbose=self.verbose > 0,
use_sparse_hessian=self.use_sparse_hessian,
)
if self._solver == "irls-ls":
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_least_squares_solver, coef, irls_data
)
# 4.2 coordinate descent ##############################################
elif self._solver == "irls-cd":
# This is the case we're concerned with for wide problems.
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_cd_solver, coef, irls_data
)
Expand Down Expand Up @@ -2880,6 +2884,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame.
use_sparse_hessian : boolean, optional, (default=True)
If ``True``, stores the current state of the Hessian as a sparse COO matrix.
If ``False``, stores it as a dense matrix of size `num_cols` x `num_cols`, where
`num_cols` is the number of columns in the data X. Use ``True`` to avoid memory
issues when working with extremely wide data.
Attributes
----------
coef_ : numpy.array, shape (n_features,)
Expand Down Expand Up @@ -2965,6 +2975,7 @@ def __init__(
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
use_sparse_hessian: bool = True,
):
self.alphas = alphas
self.alpha = alpha
Expand Down Expand Up @@ -3005,6 +3016,7 @@ def __init__(
categorical_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
use_sparse_hessian=use_sparse_hessian,
)

def _validate_hyperparameters(self) -> None:
Expand Down
68 changes: 58 additions & 10 deletions src/glum/_solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def update_hessian(state, data, active_set):
# 2) use all the rows
# 3) Include the P2 components
# 4) just like an update, we only update the active_set

hessian_init = build_hessian_delta(
data.X,
state.hessian_rows,
Expand All @@ -149,16 +150,33 @@ def update_hessian(state, data, active_set):
np.arange(data.X.shape[0], dtype=np.int32),
active_set,
)
state.hessian[np.ix_(active_set, active_set)] = hessian_init

# In the sparse Hessian case, state.hessian is a coo_matrix.
# hessian_init is a NumPy array of size active_set.shape[0] x active_set.shape[0];
# we can convert it to a coo_matrix and simply add it to state.hessian.
if data.use_sparse_hessian:
coo_data = hessian_init.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian = sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] = hessian_init

state.hessian_initialized = True
n_active_rows = data.X.shape[0]

else:
# In an update iteration, we want to:
# 1) use the difference in hessian_rows from the last iteration
# 2) filter for active_rows in case data.hessian_approx != 0
# 3) Ignore the P2 components because those don't change and have
# already been added
# 4) only update the active set subset of the hessian.

hessian_rows_diff, active_rows = identify_active_rows(
state.gradient_rows,
state.hessian_rows,
Expand All @@ -173,13 +191,32 @@ def update_hessian(state, data, active_set):
active_rows=active_rows,
active_cols=active_set,
)
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

if data.use_sparse_hessian:
coo_data = hessian_delta.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian += sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

n_active_rows = active_rows.shape[0]

return (
state.hessian[np.ix_(active_set, active_set)],
n_active_rows,
)
# If state.hessian is in COO form, we have to convert to CSC in order to index the
# active set elements, which we then convert to a numpy array for compatibility with
# the rest of the code. This numpy array is dense, but we care about problems in
# which the active set is usually much smaller than the number of data columns.
if data.use_sparse_hessian:
return (
state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(),
n_active_rows,
)
else:
return state.hessian[np.ix_(active_set, active_set)], n_active_rows


def _is_subset(x, y):
Expand Down Expand Up @@ -268,7 +305,7 @@ def _irls_solver(inner_solver, coef, data) -> tuple[np.ndarray, int, int, list[l
----------
inner_solver
A least squares solver that can handle the appropriate penalties. With
an L1 penalty, this will _cd_solver. With only an L2 penalty,
an L1 penalty, this will be _cd_solver. With only an L2 penalty,
_least_squares_solver will be more efficient.
coef : ndarray, shape (c,)
If fit_intercept=False, shape c=X.shape[1].
Expand Down Expand Up @@ -432,6 +469,7 @@ def __init__(
lower_bounds: Optional[np.ndarray] = None,
upper_bounds: Optional[np.ndarray] = None,
verbose: bool = False,
use_sparse_hessian: bool = True,
):
self.X = X
self.y = y
Expand Down Expand Up @@ -463,6 +501,7 @@ def __init__(

self.intercept_offset = 1 if self.fit_intercept else 0
self.verbose = verbose
self.use_sparse_hessian = use_sparse_hessian

self._check_data()

Expand Down Expand Up @@ -525,9 +564,18 @@ def __init__(self, coef, data):
self.old_hessian_rows = np.zeros(data.X.shape[0], dtype=data.X.dtype)
self.gradient_rows = None
self.hessian_rows = None
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

# In order to avoid memory issues for very wide datasets (e.g. Issue #485), we
# can instantiate the Hessian as a sparse COO matrix.
if data.use_sparse_hessian:
self.hessian = sparse.coo_matrix(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)
else:
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

self.hessian_initialized = False
self.coef_P2 = None
self.norm_min_subgrad = None
Expand Down

0 comments on commit df10408

Please sign in to comment.