Commit

Merge pull request #222 from SpatialHackathon/main

Sync w main

Jieran-S authored Feb 22, 2024
2 parents 6b296b0 + 9b76897 commit 71f466d

Showing 23 changed files with 239 additions and 36 deletions.
159 changes: 159 additions & 0 deletions data/STARmap_plus/STARmap_plus.py
@@ -0,0 +1,159 @@
#!/usr/bin/env python

# Author_and_contribution: Niklas Mueller-Boetticher; created template
# Author_and_contribution: Florian Heyl (heylf); created code

import argparse
import os
import tempfile
import requests
import pandas as pd
import scipy
import json
import numpy as np

from scipy.io import mmwrite
from pathlib import Path

def download_data(url, destination_folder, file_name):
    print(f'[INFO] Downloading annotated data from {url} and saving it to {destination_folder}...')

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)

    # Build the full path of the output file inside the destination folder
file_name = os.path.join(destination_folder, file_name)

# Download the file
response = requests.get(url)
with open(file_name, 'wb') as file:
file.write(response.content)

print('...done')

def get_data(out_dir):

with tempfile.TemporaryDirectory() as tmpdir:
print('[INFO] created temporary directory', tmpdir)
print('[START] COMPOSING DATA')

        # Sample names and URLs in the Zenodo record are inconsistent, so they are listed manually
samples = ['well7_5','well10','well09','well04','well06','well07','well03','well01OB',
'well05','sagittal3','well01brain','well1_5','well2_5','sagittal1','spinalcord',
'well11','sagittal2','well3_5','well08']

n_cluster = []
directories = []

for sample in samples:
print(f'[INFO] Get sample {sample}')

if not os.path.exists(f'{out_dir}/{sample}'):
os.makedirs(f'{out_dir}/{sample}')

download_data(f'https://zenodo.org/records/8327576/files/{sample}_spatial.csv?download=1',
f'{tmpdir}', f'{sample}_spatial.csv')

sample_dir = f'{out_dir}/{sample}'
directories.append(sample_dir)

# Write out coordinates.tsv, labels.tsv and observations.tsv
with open(f'{tmpdir}/{sample}_spatial.csv', 'r') as f_in, \
open(f'{sample_dir}/labels.tsv', 'w') as f_out_labels,\
open(f'{sample_dir}/coordinates.tsv', 'w') as f_out_coords, \
open(f'{sample_dir}/observations.tsv', 'w') as f_out_obs :

                # Read the two header lines; the first provides the observations header
headline_1 = f_in.readline()
headline_2 = f_in.readline()

f_out_obs.write(headline_1.replace(',','\t').replace('NAME', ''))

clusters = []

f_out_coords.write('\tx\ty\tz\n')
                f_out_labels.write('\tMain_molecular_cell_type\tSub_molecular_cell_type\tMain_molecular_tissue_region'
                                   '\tSub_molecular_tissue_region\tMolecular_spatial_cell_type\n')

for l in f_in:
data = l.strip('\n').split(",")
f_out_labels.write(f'{data[0]}\t{data[4]}\t{data[5]}\t{data[6]}\t{data[7]}\t{data[8]}\n')
f_out_coords.write(f'{data[0]}\t{data[1]}\t{data[2]}\t{data[3]}\n')
f_out_obs.write(l.replace(',','\t'))
clusters.append(data[4])

n_cluster.append(len(set(clusters)))

download_data(f'https://zenodo.org/records/8327576/files/{sample}raw_expression_pd.csv?download=1',
f'{tmpdir}', f'{sample}raw_expression_pd.csv')

# Write counts.mtx
df = pd.read_table(f'{tmpdir}/{sample}raw_expression_pd.csv', sep=',', index_col=0)
features = df.index
df = df.transpose()
mmwrite(f'{sample_dir}/counts.mtx', scipy.sparse.csr_matrix(df))

# Write out features.tsv
with open(f'{sample_dir}/features.tsv', 'w') as f_out_features :
f_out_features.write('\tgene_version\n')
for feature in features:
f_out_features.write(f'{feature}\tNA\n')

## Metadata files
download_data(f'https://zenodo.org/records/8327576/files/metadata.csv?download=1',
f'{tmpdir}', 'metadata.csv')

tmp_samples = []
position = []
patient = []

with open(f'{tmpdir}/metadata.csv', 'r') as f_in:
f_in.readline()
f_in.readline()
for l in f_in:
data = l.strip('\n').split(",")
if ( "_".join(data[0].split("_")[:-1]) not in tmp_samples):
tmp_samples.append("_".join(data[0].split("_")[:-1]))
position.append(data[8])
patient.append(data[2])

sort_metadata = [tmp_samples.index(x) for x in samples]
position = np.array(position)[sort_metadata]
patient = np.array(patient)[sort_metadata]

samples_df = pd.DataFrame({'patient': patient, 'sample': samples, 'position': position,
'replicate': ['NA']*len(samples), 'directory': directories, 'n_clusters': n_cluster})
samples_df.loc[
:, ["patient", "sample", "position", "replicate", "directory", "n_clusters"]
].to_csv(f'{out_dir}/samples.tsv', sep="\t", index_label="")

with open(f"{out_dir}/experiment.json", "w") as f:
exp_info = {"technology": 'STARmap+'}
json.dump(exp_info, f)

print('[FINISH]')

def main():
# Set up command-line argument parser
    parser = argparse.ArgumentParser(description="Load data for STARmap+ dataset. This dataset contains spatial gene "
                                                 "expression profiles of 1,022 genes mapped in 3D at a voxel size of "
                                                 "194 x 194 x 345 nm^3 in 1.09 million high-quality cells in the mouse CNS.")

# Add arguments for input and output folders
parser.add_argument('-o','--out_dir', help="Output directory to write files to.",required=True)

# Parse the command-line arguments
args = parser.parse_args()
print(args)
print(args.out_dir)
get_data(args.out_dir)

if __name__ == '__main__':
main()
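
As a quick orientation (not part of the committed script), the following sketch shows how the per-sample files written by get_data() could be read back with pandas/scipy; sample_dir is a hypothetical placeholder for any of the directories listed in samples.tsv.

import pandas as pd
from scipy.io import mmread

sample_dir = "out/well07"  # placeholder: any per-sample directory created by get_data()

# coordinates.tsv: index column plus x, y, z (written via f_out_coords above)
coords = pd.read_table(f"{sample_dir}/coordinates.tsv", index_col=0)

# labels.tsv: the five annotation columns (Main_molecular_cell_type, ...)
labels = pd.read_table(f"{sample_dir}/labels.tsv", index_col=0)

# counts.mtx: cells x genes sparse matrix; gene names are listed in features.tsv
counts = mmread(f"{sample_dir}/counts.mtx").tocsr()
features = pd.read_table(f"{sample_dir}/features.tsv", index_col=0)

print(counts.shape, coords.shape, labels.columns.tolist())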




9 changes: 9 additions & 0 deletions data/STARmap_plus/STARmap_plus.yml
@@ -0,0 +1,9 @@
channels:
- conda-forge
dependencies:
- gdown=5.1.0
- pandas=2.2.0
- requests=2.31.0
- numpy=1.26.4
- python=3.12.2
- scipy=1.12.0
10 changes: 8 additions & 2 deletions method/BANKSY/banksy.r
@@ -135,7 +135,7 @@ get_SpatialExperiment <- function(

if (!is.na(matrix_file)) {
assay(spe, assay_name, withDimnames = FALSE) <- as(Matrix::t(Matrix::readMM(matrix_file)), "CsparseMatrix")
assay(spe, "logcounts", withDimnames = FALSE) <- as(Matrix::t(Matrix::readMM(matrix_file)), "CsparseMatrix")
#assay(spe, "logcounts", withDimnames = FALSE) <- as(Matrix::t(Matrix::readMM(matrix_file)), "CsparseMatrix")
}

# Filter features and samples
@@ -168,8 +168,14 @@ k_geom <- config$k_geom
npcs <- config$npcs
resolution <- config$resolution
method <- config$method
assay_name <- "logcounts"
assay_name <- "normcounts"
set.seed(seed)

# Normalization to mean library size
spe <- computeLibraryFactors(spe)
assay(spe, assay_name) <- normalizeCounts(spe, log = FALSE)

# Run BANKSY
spe <- Banksy::computeBanksy(spe, assay_name = assay_name, k_geom = k_geom)
spe <- Banksy::runBanksyPCA(spe, lambda = lambda, npcs = npcs)
spe <- Banksy::clusterBanksy(spe, lambda = lambda, use_pcs = TRUE, npcs = npcs, resolution = resolution, seed = seed, method = method)
2 changes: 1 addition & 1 deletion method/BANKSY/banksy_optargs.json
@@ -1,5 +1,5 @@
{
"matrix": "transform",
"matrix": "counts",
"integrated_feature_selection": false,
"image": false,
"neighbors": false,
10 changes: 8 additions & 2 deletions method/BayesSpace/BayesSpace.r
@@ -84,10 +84,16 @@ embedding_file <- file.path(out_dir, "embedding.tsv")

# Use these filepaths as input ...
coord_file <- opt$coordinates
matrix_file <- opt$matrix
feature_file <- opt$features
observation_file <- opt$observations
neighbors_file <- opt$neighbors

if (!is.na(opt$neighbors)) {
neighbors_file <- opt$neighbors
}

if (!is.na(opt$matrix)) {
matrix_file <- opt$matrix
}

if (!is.na(opt$dim_red)) {
dimred_file <- opt$dim_red
2 changes: 1 addition & 1 deletion method/GraphST/GraphST_optargs.json
@@ -1,5 +1,5 @@
{
"matrix": "dimensionality_reduction",
"matrix": "transform",
"integrated_feature_selection": true,
"image": false,
"neighbors": true,
3 changes: 3 additions & 0 deletions method/GraphST/method_GraphST.py
@@ -156,6 +156,9 @@ def get_anndata(args):
adata = get_anndata(args)
adata.var_names_make_unique()

# Assuming transform=log1p, here's the scaling step: https://github.com/JinmiaoChenLab/GraphST/blob/main/GraphST/preprocess.py
sc.pp.scale(adata, zero_center=False, max_value=10)

# Set seed
random.seed(seed)
torch.manual_seed(seed)
3 changes: 3 additions & 0 deletions method/SCAN-IT/method_scanit.py
@@ -131,6 +131,9 @@ def get_anndata(args):
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'

# Assuming transform=log1p, here's the scaling step: https://github.com/zcang/SCAN-IT/blob/main/examples/Slide-seq/scanit.ipynb
sc.pp.scale(adata)

# Construct the spatial graph
scanit.tl.spatial_graph(adata, method='alpha shape', alpha_n_layer=2, knn_n_neighbors=5)

2 changes: 1 addition & 1 deletion method/SOTIP/method_sotip.py
@@ -171,7 +171,7 @@ def get_anndata(args):
adata.obsp['ME_EMD_mat'] = adata_phEMD.obsm['X_ME_EMD_mat']

# Compute the MEG, each node is a ME, edge is the connectivity
sc.pp.neighbors(adata, n_neighbors=n_neighbors)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep="reduced_dimensions")
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(adata_phEMD.obsm['X_ME_EMD_mat'], n_neighbors=n_neighbors, metric='precomputed')
adata.obsp['distances'], adata.obsp['connectivities'] = sc.neighbors._compute_connectivities_umap(
knn_indices,
4 changes: 2 additions & 2 deletions method/SpaceFlow/method_spaceflow.py
@@ -145,7 +145,7 @@ def get_anndata(args):
sf.spatial_graph = spatial_graph

# Train the network
sf.train(spatial_regularization_strength=0.1, z_dim=50, lr=1e-3, epochs=1000, max_patience=50, min_stop=100, random_seed=seed, gpu=device, regularization_acceleration=True, edge_subset_sz=1000000, embedding_save_filepath=out_dir)
sf.train(spatial_regularization_strength=0.1, z_dim=50, lr=1e-3, epochs=1000, max_patience=50, min_stop=100, random_seed=seed, gpu=device, regularization_acceleration=True, edge_subset_sz=1000000, embedding_save_filepath=embedding_file)

if config is not None:
res = int(config['res'])
@@ -158,7 +158,7 @@ def get_anndata(args):
warnings.warn("The `n_clusters` parameter was not used; config['res'] used instead.")

# Segment the domains given the resolution
sf.segmentation(domain_label_save_filepath=out_dir, n_neighbors=nn, resolution=res)
sf.segmentation(domain_label_save_filepath=label_file, n_neighbors=nn, resolution=res)

label_df = pd.DataFrame(sf.domains) # DataFrame with index (cell-id/barcode) and 1 column (label)
embedding_df = pd.DataFrame(sf.embedding, index=adata.obs_names) # DataFrame with index (cell-id/barcode) and n columns
2 changes: 1 addition & 1 deletion method/SpaceFlow/spaceflow_optargs.json
@@ -1,5 +1,5 @@
{
"matrix": "dimensionality_reduction",
"matrix": "transform",
"integrated_feature_selection": false,
"image": false,
"neighbors": false,
21 changes: 15 additions & 6 deletions method/SpiceMix/SpiceMix.py
@@ -22,28 +22,37 @@
parser.add_argument(
"-o", "--observations", help="Path to observations (as tsv).", required=True
)

parser.add_argument(
"-n",
"--neighbors",
help="Path to neighbor definitions. Square matrix (not necessarily symmetric) where each row contains the neighbors of this observation (as mtx).",
required=False,
)

parser.add_argument("-d", "--out_dir", help="Output directory.", required=True)

parser.add_argument(
"--dim_red",
help="Reduced dimensionality representation (e.g. PCA).",
required=False,
)
parser.add_argument("--image", help="Path to H&E staining.", required=False)
parser.add_argument(
"--n_clusters", help="Number of clusters to return.", required=True, type=int
)
parser.add_argument(
"--technology",
help="The technology of the dataset (Visium, ST, imaging-based).",
required=True,
)
parser.add_argument(
"--seed", help="Seed to use for random operations.", required=True, type=int
)
parser.add_argument(
"--config",
help="Config file used to pass additional parameters.",
required=True,
help="Optional config file (json) used to pass additional parameters.",
required=False,
)


args = parser.parse_args()

from pathlib import Path
@@ -141,7 +141,7 @@ def get_anndata(args):

del adata.obsp["spatial_connectivities"]

adata = PopariDataset(adata, "raw")
adata = PopariDataset(adata, "processed")

return adata

6 changes: 3 additions & 3 deletions method/SpiceMix/SpiceMix_optargs.json
@@ -1,7 +1,7 @@
{
"matrix": "counts",
"integrated_feature_selection": false,
"matrix": "transform",
"integrated_feature_selection": true,
"image": false,
"neighbors": false,
"neighbors": true,
"config_file": true
}
5 changes: 1 addition & 4 deletions method/SpiceMix/config/config_1.json
@@ -4,8 +4,5 @@
"device": "cuda:0",
"dtype": "float64",
"num_preiterations": 5,
"num_iterations": 200,
"preprocess": {
"hvgs": 3500
}
"num_iterations": 200
}
