From 3f7a160d8587a05f61e9c49409463fa9d19fc3d4 Mon Sep 17 00:00:00 2001
From: Jieran Sun <jiersun99@gmail.com>
Date: Mon, 26 Feb 2024 10:41:24 +0000
Subject: [PATCH] Binary search implemented for python methods

---
 method/STAGATE/method_STAGATE.py           | 16 ++++++++++++---
 method/SpaceFlow/method_spaceflow.py       | 20 +++++++++++++++---
 method/scanpy/method_scanpy.py             | 24 +++++++++++++++++-----
 method/{binary_search.py => search_res.py} | 13 ++++--------
 4 files changed, 53 insertions(+), 20 deletions(-)
 rename method/{binary_search.py => search_res.py} (90%)

diff --git a/method/STAGATE/method_STAGATE.py b/method/STAGATE/method_STAGATE.py
index 7105f304..2dac844d 100644
--- a/method/STAGATE/method_STAGATE.py
+++ b/method/STAGATE/method_STAGATE.py
@@ -78,13 +78,22 @@
 import numpy as np
 import pandas as pd
 import scanpy as sc
-import sys
 # Use tensorflow or pyG
 import tensorflow.compat.v1 as tf
 tf.compat.v1.disable_eager_execution()
 # the location of R (used for the mclust clustering)
 import scipy as sp
 
+# import res-n_clust tuning function
+import sys
+from pathlib import Path
+
+# Add the parent directory of the current file to sys.path
+method_dir = Path(__file__).resolve().parent.parent  # Navigate two levels up
+sys.path.append(str(method_dir))
+
+from search_res import binary_search
+
 def get_anndata(args):
     import anndata as ad
     X = sp.io.mmread(args.matrix)
@@ -152,8 +161,9 @@ def get_anndata(args):
     adata = sg.mclust_R(adata, used_obsm='STAGATE', num_cluster=n_clusters, random_seed=seed)
     label_df = adata.obs[["mclust"]]
 elif config["method"] == "louvain":
-    sc.tl.louvain(adata, resolution=config["res"])
-    label_df = adata.obs[["louvain"]]
+    label_df = binary_search(adata, n_clust_target=n_clusters, method="louvain", seed = seed)
+    #sc.tl.louvain(adata, resolution=config["res"])
+    #label_df = adata.obs[["louvain"]]
 
 ## Write output
 out_dir.mkdir(parents=True, exist_ok=True)
diff --git a/method/SpaceFlow/method_spaceflow.py b/method/SpaceFlow/method_spaceflow.py
index 2994d897..5d1a033b 100644
--- a/method/SpaceFlow/method_spaceflow.py
+++ b/method/SpaceFlow/method_spaceflow.py
@@ -125,11 +125,22 @@ def get_anndata(args):
 random.seed(seed)
 
 import scanpy as sc
+import anndata as ad
 from SpaceFlow import SpaceFlow
 import pandas as pd
 import warnings
 import torch
 
+# import res-n_clust tuning function
+import sys
+from pathlib import Path
+
+# Add the parent directory of the current file to sys.path
+method_dir = Path(__file__).resolve().parent.parent  # Navigate two levels up
+sys.path.append(str(method_dir))
+
+from search_res import binary_search
+
 use_cuda = torch.cuda.is_available()
 device = 1 if use_cuda else 0
 
@@ -155,12 +166,15 @@ def get_anndata(args):
     nn = 15
 
 # Raise a warning that clustering is based on resolution and not n_clusters
-warnings.warn("The `n_clusters` parameter was not used; config['res'] used instead.")
+# warnings.warn("The `n_clusters` parameter was not used; config['res'] used instead.")
 
 # Segment the domains given the resolution
-sf.segmentation(domain_label_save_filepath=label_file, n_neighbors=nn, resolution=res)
+embedding_adata = ad.AnnData(sf.embedding)
+sc.pp.neighbors(embedding_adata, n_neighbors=nn, use_rep="X")
+label_df = binary_search(embedding_adata, n_clust_target=n_clusters, method="leiden", seed = seed)
+# sf.segmentation(domain_label_save_filepath=label_file, n_neighbors=nn, resolution=res)
 
-label_df = pd.DataFrame(sf.domains)  # DataFrame with index (cell-id/barcode) and 1 column (label)
+# label_df = pd.DataFrame(sf.domains)  # DataFrame with index (cell-id/barcode) and 1 column (label)
 embedding_df = pd.DataFrame(sf.embedding, index=adata.obs_names) # DataFrame with index (cell-id/barcode) and n columns
 
 ## Write output
diff --git a/method/scanpy/method_scanpy.py b/method/scanpy/method_scanpy.py
index 2dbd0f03..390034d5 100644
--- a/method/scanpy/method_scanpy.py
+++ b/method/scanpy/method_scanpy.py
@@ -149,6 +149,16 @@ def get_anndata(args):
 import scanpy as sc
 import json
 
+# import res-n_clust tuning function
+import sys
+from pathlib import Path
+
+# Add the parent directory of the current file to sys.path
+method_dir = Path(__file__).resolve().parent.parent  # Navigate two levels up
+sys.path.append(str(method_dir))
+
+from search_res import binary_search
+
 # get the json config
 with open (args.config, "r") as c:
     config = json.load(c)
@@ -160,7 +170,7 @@ def get_anndata(args):
     # throw warning about not using the num_clusters as a parameter, because scanpy uses leiden or louvain and needs the resolution parameter, which is defined in the config.json. There exists a good function to perform an extensive search for the right resolution parameter to define the desired num_clusters, but we still have licensing issues. For more, see the SpaceHack2.0 GitHub issue #139
 import warnings
 
-warnings.warn("Scanpy uses leiden/louvain for clustering, which relies on the resolution parameter in the config file. The parameter num_clusters will be ignored.", UserWarning)
+# warnings.warn("Scanpy uses leiden/louvain for clustering, which relies on the resolution parameter in the config file. The parameter num_clusters will be ignored.", UserWarning)
 
 # scanpy starts here
 if not args.dim_red:
@@ -173,14 +183,18 @@ def get_anndata(args):
 
 #two options - leiden or loivain
 if config['clustering'] == "louvain":
-    sc.tl.louvain(adata, resolution=config["resolution"], random_state=seed, key_added='louvain')
+    label_df = binary_search(adata, n_clust_target=n_clusters, method="louvain", seed = seed)
+    # sc.tl.louvain(adata, resolution=config["resolution"], random_state=seed, key_added='louvain')
 elif config['clustering'] == "leiden":
-    sc.tl.leiden(adata, resolution=config["resolution"], random_state=seed)
+    label_df = binary_search(adata, n_clust_target=n_clusters, method="leiden", seed = seed)
+    # sc.tl.leiden(adata, resolution=config["resolution"], random_state=seed)
 else: 
     print("No clustering method defined or your method is not available, performing leiden")
-    sc.tl.leiden(adata, resolution=config["resolution"], random_state=seed)
+    label_df = binary_search(adata, n_clust_target=n_clusters, method="leiden", seed = seed)
+
+# sc.tl.leiden(adata, resolution=config["resolution"], random_state=seed)
 
-label_df = adata.obs[["leiden"]]
+# label_df = adata.obs[["leiden"]]
 
 
 # embedding_df = None  # optional, DataFrame with index (cell-id/barcode) and n columns
diff --git a/method/binary_search.py b/method/search_res.py
similarity index 90%
rename from method/binary_search.py
rename to method/search_res.py
index e4134628..06e01407 100644
--- a/method/binary_search.py
+++ b/method/search_res.py
@@ -8,7 +8,7 @@ def binary_search(
     tolerance = 1e-3,
     seed = 2023,
 ):
-"""
+    """
     Uses binary search to find the resolution parameter that results in the target number of clusters.
 
     Parameters
@@ -46,7 +46,7 @@ def binary_search(
     Returns
     ------------
     y: a pandas dataframe with one clumn denoting the clustering results from best resolution and with barcode as index.
-"""
+    """
     import scanpy as sc
     import numpy as np
     import warnings
@@ -54,13 +54,8 @@ def binary_search(
     y = None
 
     def do_clustering(res):
-        match method:
-            case "louvain":
-                sc.tl.louvain(adata, resolution=res, random_state=seed)
-                y = adata.obs[["louvain"]].astype(int)
-            case "leiden":
-                sc.tl.leiden(adata, resolution=res, random_state=seed)
-                y = adata.obs[["leiden"]].astype(int)
+        getattr(sc.tl, method)(adata, resolution=res, random_state=seed)
+        y = adata.obs[[method]].astype(int)
         n_clust = len(np.unique(y))
         return y, n_clust