def compute_variance(
    self: "AnalysisMixinProtocol[K, B]",
    source: "K",
    target: "K",
    forward: bool = True,
    latent_space_selection: Union[str, list[str]] = "X_pca",
    key_added: Optional[str] = "conditional_variance",
    batch_size: Optional[int] = None,
) -> Optional[pd.DataFrame]:
    """Compute the conditional variance per cell.

    For every cell of the conditioning distribution, the coupling yields a conditional distribution ``p``
    over the cells of the opposite distribution. With ``x_j`` the latent representation of the ``j``-th
    opposite cell and ``mu = sum_j p_j x_j`` the conditional mean, the value stored for the cell is
    ``sum_j p_j * ||x_j - mu||^2``. Larger values mean the cell's mass is spread over cells that lie
    farther apart in the chosen latent space.

    Parameters
    ----------
    source
        Source key.
    target
        Target key.
    forward
        If `True`, computes the conditional variance given a cell in the source distribution, else the
        conditional variance given a cell in the target distribution.
    latent_space_selection
        Key or keys specifying the latent or feature space used for computing the conditional variance.
        A single key has to be a latent space in :attr:`~anndata.AnnData.obsm` or
        a gene in :attr:`~anndata.AnnData.var_names`.
        A collection of keys (list, tuple, set or array) is matched against
        :attr:`~anndata.AnnData.var_names`; names that do not occur there are ignored.
    key_added
        Key in :attr:`~anndata.AnnData.obs` where the variance is stored.
    batch_size
        Batch size for the computation of the variance. If :obj:`None`, the entire dataset is used.

    Returns
    -------
    :obj:`None` if ``key_added`` is not None. Otherwise, returns a data frame of shape ``(n_cells, 1)``
    containing the conditional variance given each cell.

    Raises
    ------
    KeyError
        If a single key is neither in ``obsm`` nor in ``var_names``, if none of the given genes is in
        ``var_names``, or if ``latent_space_selection`` has an unsupported type.
    ValueError
        If ``batch_size`` is smaller than 1.
    """

    def _as_dense_2d(values):
        # ``X`` and ``obsm`` entries can be sparse or dense; only sparse containers expose ``toarray``.
        dense = values.toarray() if hasattr(values, "toarray") else values
        dense = np.asarray(dense, dtype=float)
        # A single selected gene may come back 1-D; keep the layout (n_cells, n_features).
        return dense.reshape(-1, 1) if dense.ndim == 1 else dense

    if batch_size is not None and batch_size < 1:
        raise ValueError(f"Expected `batch_size` to be positive, found `{batch_size}`.")

    filter_value = source if forward else target
    opposite_filter_value = target if forward else source

    if isinstance(latent_space_selection, str):
        if latent_space_selection in self.adata.obsm:
            latent_space = _as_dense_2d(self.adata.obsm[latent_space_selection])
        elif latent_space_selection in self.adata.var_names:
            # Index by the gene name itself (wrapped in a list so the result stays 2-D).
            latent_space = _as_dense_2d(self.adata[:, [latent_space_selection]].X)
        else:
            raise KeyError("Gene/Latent space not found.")
    elif isinstance(latent_space_selection, (list, tuple, set, frozenset, np.ndarray)):
        selected = set(latent_space_selection)  # O(1) membership while scanning var_names
        mask = [var_name in selected for var_name in self.adata.var_names]
        if not any(mask):
            raise KeyError("Gene/Latent space not found.")
        latent_space = _as_dense_2d(self.adata[:, mask].X)
    else:
        raise KeyError("Unknown latent space selection.")

    groups = self.adata.obs[self._policy.key]
    in_given = np.asarray(groups == filter_value)
    in_opposite = np.asarray(groups == opposite_filter_value)
    latent_space_filtered = latent_space[in_opposite, :]

    n_cells = int(in_given.sum())
    # ``max(..., 1)`` keeps ``range`` valid when no cell matches ``filter_value``.
    step = max(n_cells, 1) if batch_size is None else batch_size
    func = self.push if forward else self.pull

    variances = np.full(n_cells, np.nan, dtype=float)
    for start in range(0, n_cells, step):
        # NOTE(review): assumes ``push``/``pull`` return an array of shape (n_opposite_cells, n_cells_in_batch)
        # whose columns are normalized conditional distributions (``normalize=True``) - confirm.
        cond_dists = np.asarray(
            func(
                source=source,
                target=target,
                data=None,
                subset=(start, step),
                normalize=True,
                return_all=False,
                scale_by_marginals=False,
                split_mass=True,
                key_added=None,
            ),
            dtype=float,
        )

        cond_var = []
        for probs in cond_dists.T:
            # Conditional mean of the latent representation, shape (n_features,).
            expected_val = probs @ latent_space_filtered
            sq_dist = np.sum((latent_space_filtered - expected_val) ** 2, axis=1)
            cond_var.append(sq_dist @ probs)

        variances[start : min(start + step, n_cells)] = np.asarray(cond_var, dtype=float)

    df = pd.DataFrame(
        variances,
        index=self.adata.obs_names[in_given],
        columns=[key_added] if key_added is not None else ["variance"],
    )

    if key_added is not None:
        # Cells outside ``filter_value`` are absent from ``df`` and end up as NaN after index alignment.
        self.adata.obs[key_added] = df
    return df if key_added is None else None
@pytest.mark.parametrize("forward", [True, False])
@pytest.mark.parametrize("key_added", [None, "test"])
@pytest.mark.parametrize("batch_size", [None, 2])
@pytest.mark.parametrize("latent_space_selection", ["X_pca", "KLF12", ["KLF12", "Dlip3", "Dref"]])
def test_compute_variance_pipeline(
    self,
    adata_time: AnnData,
    forward: bool,
    latent_space_selection,
    key_added: Optional[str],
    batch_size: Optional[int],
):
    """Smoke-test ``compute_variance`` on a mocked transport map between time points 0 and 1.

    Only the shape of the output and the NaN pattern written to ``obs`` are checked, not the
    numerical values of the variance.
    """
    rng = np.random.RandomState(42)
    adata_time = adata_time[adata_time.obs["time"].isin((0, 1))].copy()
    n0 = adata_time[adata_time.obs["time"] == 0].n_obs
    n1 = adata_time[adata_time.obs["time"] == 1].n_obs

    tmap = rng.uniform(1e-6, 1, size=(n0, n1))
    tmap /= tmap.sum()
    problem = CompoundProblemWithMixin(adata_time)
    problem = problem.prepare(key="time", xy_callback="local-pca", policy="sequential")
    problem[0, 1]._solution = MockSolverOutput(tmap)

    out = problem.compute_variance(
        source=0,
        target=1,
        forward=forward,
        key_added=key_added,
        latent_space_selection=latent_space_selection,
        batch_size=batch_size,
    )

    # Forward conditions on the source cells (time 0), backward on the target cells (time 1).
    given_time, other_time = (0, 1) if forward else (1, 0)
    n_given, n_other = (n0, n1) if forward else (n1, n0)

    if key_added is None:
        assert isinstance(out, pd.DataFrame)
        assert len(out) == n_given
    else:
        assert out is None
        assert key_added in adata_time.obs
        time = adata_time.obs["time"]
        # Every conditioning cell receives a value; every cell of the other time point stays NaN.
        assert adata_time.obs.loc[time == given_time, key_added].isna().sum() == 0
        assert adata_time.obs.loc[time == other_time, key_added].isna().sum() == n_other
11 Jun 2024 16:11:33 +0200 Subject: [PATCH 2/6] Add conditional variance --- src/moscot/base/problems/_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/moscot/base/problems/_mixins.py b/src/moscot/base/problems/_mixins.py index 3a78ef239..59c5a9a37 100644 --- a/src/moscot/base/problems/_mixins.py +++ b/src/moscot/base/problems/_mixins.py @@ -746,8 +746,8 @@ def compute_variance( conditional variance given a cell in the target distribution. latent_space_selection: Key or Keys which specifies the latent or feature space used for computing the conditional variance. - A single key can be a latent space in `~anndata.AnnData.obsm` or a gene in `~anndata.AnnData.var_names`, - a set of keys has to be a subset of genes in `~anndata.AnnData.var_names`. + A single key has to be a latent space in :attr:`~anndata.AnnData.obsm` or a gene in :attr:`~anndata.AnnData.var_names`, + a set of keys has to be a subset of genes in :attr:`~anndata.AnnData.var_names`. key_added Key in :attr:`~anndata.AnnData.obs` where the variance is stored. 
batch_size From fa72e236b7aaf3eba35ca5053be0a1809a99b9cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:18:38 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/moscot/base/problems/_mixins.py | 10 ++++------ tests/problems/generic/test_mixins.py | 8 ++++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/moscot/base/problems/_mixins.py b/src/moscot/base/problems/_mixins.py index 59c5a9a37..06c97703b 100644 --- a/src/moscot/base/problems/_mixins.py +++ b/src/moscot/base/problems/_mixins.py @@ -720,7 +720,7 @@ def compute_entropy( if key_added is not None: self.adata.obs[key_added] = df return df if key_added is None else None - + def compute_variance( self: AnalysisMixinProtocol[K, B], source: K, @@ -732,7 +732,7 @@ def compute_variance( ) -> Optional[pd.DataFrame]: """Compute the conditional variance per cell. - The conditional variance reflects the uncertainty of the mapping of a single cell by taking into account + The conditional variance reflects the uncertainty of the mapping of a single cell by taking into account a given latent space representation of all cells. 
Parameters @@ -798,13 +798,11 @@ def compute_variance( cond_var = [] for i in range(cond_dists.shape[1]): - expected_val = (cond_dists[:,i]).reshape(-1,1) * latent_space_filtered - cond_var.append(np.linalg.norm((latent_space_filtered - expected_val), axis=1)**2 @ cond_dists[:,i]) + expected_val = (cond_dists[:, i]).reshape(-1, 1) * latent_space_filtered + cond_var.append(np.linalg.norm((latent_space_filtered - expected_val), axis=1) ** 2 @ cond_dists[:, i]) df.iloc[range(batch, min(batch + batch_size, len(df))), 0] = np.array(cond_var) - if key_added is not None: self.adata.obs[key_added] = df return df if key_added is None else None - diff --git a/tests/problems/generic/test_mixins.py b/tests/problems/generic/test_mixins.py index 10928058c..afab3030b 100644 --- a/tests/problems/generic/test_mixins.py +++ b/tests/problems/generic/test_mixins.py @@ -355,7 +355,12 @@ def test_compute_variance_pipeline( problem[0, 1]._solution = MockSolverOutput(tmap) out = problem.compute_variance( - source=0, target=1, forward=forward, key_added=key_added, latent_space_selection=latent_space_selection, batch_size=batch_size + source=0, + target=1, + forward=forward, + key_added=key_added, + latent_space_selection=latent_space_selection, + batch_size=batch_size, ) if key_added is None: assert isinstance(out, pd.DataFrame) @@ -370,7 +375,6 @@ def test_compute_variance_pipeline( else n0 ) - def test_seed_reproducible(self, adata_time: AnnData): key_added = "test" rng = np.random.RandomState(42) From 5a392f565de8f74aab06841ffa4b9bf9542fb40c Mon Sep 17 00:00:00 2001 From: Leon Stadelmann Date: Tue, 11 Jun 2024 18:41:03 +0200 Subject: [PATCH 4/6] Fix linting --- src/moscot/base/problems/_mixins.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/moscot/base/problems/_mixins.py b/src/moscot/base/problems/_mixins.py index 06c97703b..212db36e9 100644 --- a/src/moscot/base/problems/_mixins.py +++ b/src/moscot/base/problems/_mixins.py @@ -746,8 +746,9 @@ def 
compute_variance( conditional variance given a cell in the target distribution. latent_space_selection: Key or Keys which specifies the latent or feature space used for computing the conditional variance. - A single key has to be a latent space in :attr:`~anndata.AnnData.obsm` or a gene in :attr:`~anndata.AnnData.var_names`, - a set of keys has to be a subset of genes in :attr:`~anndata.AnnData.var_names`. + A single key has to be a latent space in :attr:`~anndata.AnnData.obsm` or + a gene in :attr:`~anndata.AnnData.var_names`. + A set of keys has to be a subset of genes in :attr:`~anndata.AnnData.var_names`. key_added Key in :attr:`~anndata.AnnData.obs` where the variance is stored. batch_size @@ -761,7 +762,7 @@ def compute_variance( filter_value = source if forward else target opposite_filter_value = target if forward else source - if type(latent_space_selection) == str: + if isinstance(latent_space_selection, str): if latent_space_selection in self.adata.obsm: latent_space = self.adata.obsm[latent_space_selection] elif latent_space_selection in self.adata.var_names: @@ -769,7 +770,7 @@ def compute_variance( else: raise KeyError("Gene/Latent space not found.") elif type(latent_space_selection) in [list, np.ndarray]: - mask = [True if var_name in latent_space_selection else False for var_name in self.adata.var_names] + mask = [var_name in latent_space_selection for var_name in self.adata.var_names] latent_space = self.adata[:, mask].X.toarray() else: raise KeyError("Unknown latent space selection.") From 3aebd31a7570fe3c7412f53f72c16cd6d3c6967d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 16:41:45 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/moscot/base/problems/_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/moscot/base/problems/_mixins.py 
b/src/moscot/base/problems/_mixins.py index 212db36e9..4eefc22e3 100644 --- a/src/moscot/base/problems/_mixins.py +++ b/src/moscot/base/problems/_mixins.py @@ -746,7 +746,7 @@ def compute_variance( conditional variance given a cell in the target distribution. latent_space_selection: Key or Keys which specifies the latent or feature space used for computing the conditional variance. - A single key has to be a latent space in :attr:`~anndata.AnnData.obsm` or + A single key has to be a latent space in :attr:`~anndata.AnnData.obsm` or a gene in :attr:`~anndata.AnnData.var_names`. A set of keys has to be a subset of genes in :attr:`~anndata.AnnData.var_names`. key_added From b74fe9efb5074da111f511b02f285280bf1ec26d Mon Sep 17 00:00:00 2001 From: LeonStadelmann Date: Thu, 13 Jun 2024 15:50:50 +0200 Subject: [PATCH 6/6] fix linting 2 --- src/moscot/base/problems/_mixins.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/moscot/base/problems/_mixins.py b/src/moscot/base/problems/_mixins.py index 4eefc22e3..28bf5f093 100644 --- a/src/moscot/base/problems/_mixins.py +++ b/src/moscot/base/problems/_mixins.py @@ -798,9 +798,9 @@ def compute_variance( ) cond_var = [] - for i in range(cond_dists.shape[1]): - expected_val = (cond_dists[:, i]).reshape(-1, 1) * latent_space_filtered - cond_var.append(np.linalg.norm((latent_space_filtered - expected_val), axis=1) ** 2 @ cond_dists[:, i]) + for i in range(cond_dists.shape[1]): # type: ignore[union-attr] + expected_val = (cond_dists[:, i]).reshape(-1, 1) * latent_space_filtered # type: ignore[index] + cond_var.append(np.linalg.norm((latent_space_filtered - expected_val), axis=1) ** 2 @ cond_dists[:, i]) # type: ignore[index] df.iloc[range(batch, min(batch + batch_size, len(df))), 0] = np.array(cond_var)