Commit

Merge pull request #222 from SpatialHackathon/main

Sync w main

Jieran-S authored Feb 22, 2024
2 parents 6b296b0 + 9b76897 commit 71f466d

Showing 23 changed files with 239 additions and 36 deletions.
159 changes: 159 additions & 0 deletions data/STARmap_plus/STARmap_plus.py
@@ -0,0 +1,159 @@
#!/usr/bin/env python

# Author_and_contribution: Niklas Mueller-Boetticher; created template
# Author_and_contribution: Florian Heyl (heylf); created code

import argparse
import os
import tempfile
import requests
import pandas as pd
import scipy
import json
import numpy as np

from scipy.io import mmwrite
from pathlib import Path

def download_data(url, destination_folder, file_name):
    print(f'[INFO] Downloading annotated data from {url} and saving it to {destination_folder}...')

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)

    # Build the full path of the output file inside the destination folder
file_name = os.path.join(destination_folder, file_name)

# Download the file
response = requests.get(url)
with open(file_name, 'wb') as file:
file.write(response.content)

print('...done')

def get_data(out_dir):

with tempfile.TemporaryDirectory() as tmpdir:
print('[INFO] created temporary directory', tmpdir)
print('[START] COMPOSING DATA')

        # Sample names and URLs in the Zenodo record are inconsistent, so they are listed manually
samples = ['well7_5','well10','well09','well04','well06','well07','well03','well01OB',
'well05','sagittal3','well01brain','well1_5','well2_5','sagittal1','spinalcord',
'well11','sagittal2','well3_5','well08']

n_cluster = []
directories = []

for sample in samples:
print(f'[INFO] Get sample {sample}')

if not os.path.exists(f'{out_dir}/{sample}'):
os.makedirs(f'{out_dir}/{sample}')

download_data(f'https://zenodo.org/records/8327576/files/{sample}_spatial.csv?download=1',
f'{tmpdir}', f'{sample}_spatial.csv')

sample_dir = f'{out_dir}/{sample}'
directories.append(sample_dir)

# Write out coordinates.tsv, labels.tsv and observations.tsv
with open(f'{tmpdir}/{sample}_spatial.csv', 'r') as f_in, \
open(f'{sample_dir}/labels.tsv', 'w') as f_out_labels,\
open(f'{sample_dir}/coordinates.tsv', 'w') as f_out_coords, \
open(f'{sample_dir}/observations.tsv', 'w') as f_out_obs :

                # Read the two header lines; the first provides the observations header
headline_1 = f_in.readline()
headline_2 = f_in.readline()

f_out_obs.write(headline_1.replace(',','\t').replace('NAME', ''))

clusters = []

f_out_coords.write('\tx\ty\tz\n')
                f_out_labels.write('\tMain_molecular_cell_type\tSub_molecular_cell_type\tMain_molecular_tissue_region'
                                   '\tSub_molecular_tissue_region\tMolecular_spatial_cell_type\n')

for l in f_in:
data = l.strip('\n').split(",")
f_out_labels.write(f'{data[0]}\t{data[4]}\t{data[5]}\t{data[6]}\t{data[7]}\t{data[8]}\n')
f_out_coords.write(f'{data[0]}\t{data[1]}\t{data[2]}\t{data[3]}\n')
f_out_obs.write(l.replace(',','\t'))
clusters.append(data[4])

n_cluster.append(len(set(clusters)))

download_data(f'https://zenodo.org/records/8327576/files/{sample}raw_expression_pd.csv?download=1',
f'{tmpdir}', f'{sample}raw_expression_pd.csv')

# Write counts.mtx
df = pd.read_table(f'{tmpdir}/{sample}raw_expression_pd.csv', sep=',', index_col=0)
features = df.index
df = df.transpose()
mmwrite(f'{sample_dir}/counts.mtx', scipy.sparse.csr_matrix(df))

# Write out features.tsv
with open(f'{sample_dir}/features.tsv', 'w') as f_out_features :
f_out_features.write('\tgene_version\n')
for feature in features:
f_out_features.write(f'{feature}\tNA\n')

## Metadata files
download_data(f'https://zenodo.org/records/8327576/files/metadata.csv?download=1',
f'{tmpdir}', 'metadata.csv')

tmp_samples = []
position = []
patient = []

with open(f'{tmpdir}/metadata.csv', 'r') as f_in:
f_in.readline()
f_in.readline()
for l in f_in:
data = l.strip('\n').split(",")
if ( "_".join(data[0].split("_")[:-1]) not in tmp_samples):
tmp_samples.append("_".join(data[0].split("_")[:-1]))
position.append(data[8])
patient.append(data[2])

sort_metadata = [tmp_samples.index(x) for x in samples]
position = np.array(position)[sort_metadata]
patient = np.array(patient)[sort_metadata]

samples_df = pd.DataFrame({'patient': patient, 'sample': samples, 'position': position,
'replicate': ['NA']*len(samples), 'directory': directories, 'n_clusters': n_cluster})
samples_df.loc[
:, ["patient", "sample", "position", "replicate", "directory", "n_clusters"]
].to_csv(f'{out_dir}/samples.tsv', sep="\t", index_label="")

with open(f"{out_dir}/experiment.json", "w") as f:
exp_info = {"technology": 'STARmap+'}
json.dump(exp_info, f)

print('[FINISH]')

def main():
# Set up command-line argument parser
    parser = argparse.ArgumentParser(description="Load data for STARmap+ dataset. This dataset contains spatial gene "
                                                 "expression profiles of 1,022 genes mapped in 3D at a voxel size of "
                                                 "194 x 194 x 345 nm^3 in 1.09 million high-quality cells in the mouse CNS.")

# Add arguments for input and output folders
parser.add_argument('-o','--out_dir', help="Output directory to write files to.",required=True)

# Parse the command-line arguments
args = parser.parse_args()
print(args)
print(args.out_dir)
get_data(args.out_dir)

if __name__ == '__main__':
main()
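
As a quick orientation (not part of the committed script), the following sketch shows how the per-sample files written by get_data() could be read back with pandas/scipy; sample_dir is a hypothetical placeholder for any of the directories listed in samples.tsv.

import pandas as pd
from scipy.io import mmread

sample_dir = "out/well07"  # placeholder: any per-sample directory created by get_data()

# coordinates.tsv: index column plus x, y, z (written via f_out_coords above)
coords = pd.read_table(f"{sample_dir}/coordinates.tsv", index_col=0)

# labels.tsv: the five annotation columns (Main_molecular_cell_type, ...)
labels = pd.read_table(f"{sample_dir}/labels.tsv", index_col=0)

# counts.mtx: cells x genes sparse matrix; gene names are listed in features.tsv
counts = mmread(f"{sample_dir}/counts.mtx").tocsr()
features = pd.read_table(f"{sample_dir}/features.tsv", index_col=0)

print(counts.shape, coords.shape, labels.columns.tolist())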




9 changes: 9 additions & 0 deletions data/STARmap_plus/STARmap_plus.yml
@@ -0,0 +1,9 @@
channels:
- conda-forge
dependencies:
- gdown=5.1.0
- pandas=2.2.0
- requests=2.31.0
- numpy=1.26.4
- python=3.12.2
- scipy=1.12.0
10 changes: 8 additions & 2 deletions method/BANKSY/banksy.r
@@ -135,7 +135,7 @@ get_SpatialExperiment <- function(

if (!is.na(matrix_file)) {
assay(spe, assay_name, withDimnames = FALSE) <- as(Matrix::t(Matrix::readMM(matrix_file)), "CsparseMatrix")
assay(spe, "logcounts", withDimnames = FALSE) <- as(Matrix::t(Matrix::readMM(matrix_file)), "CsparseMatrix")
#assay(spe, "logcounts", withDimnames = FALSE) <- as(Matrix::t(Matrix::readMM(matrix_file)), "CsparseMatrix")
}

# Filter features and samples
@@ -168,8 +168,14 @@ k_geom <- config$k_geom
npcs <- config$npcs
resolution <- config$resolution
method <- config$method
assay_name <- "logcounts"
assay_name <- "normcounts"
set.seed(seed)

# Normalization to mean library size
spe <- computeLibraryFactors(spe)
assay(spe, assay_name) <- normalizeCounts(spe, log = FALSE)

# Run BANKSY
spe <- Banksy::computeBanksy(spe, assay_name = assay_name, k_geom = k_geom)
spe <- Banksy::runBanksyPCA(spe, lambda = lambda, npcs = npcs)
spe <- Banksy::clusterBanksy(spe, lambda = lambda, use_pcs = TRUE, npcs = npcs, resolution = resolution, seed = seed, method = method)
2 changes: 1 addition & 1 deletion method/BANKSY/banksy_optargs.json
@@ -1,5 +1,5 @@
{
"matrix": "transform",
"matrix": "counts",
"integrated_feature_selection": false,
"image": false,
"neighbors": false,
10 changes: 8 additions & 2 deletions method/BayesSpace/BayesSpace.r
@@ -84,10 +84,16 @@ embedding_file <- file.path(out_dir, "embedding.tsv")

# Use these filepaths as input ...
coord_file <- opt$coordinates
matrix_file <- opt$matrix
feature_file <- opt$features
observation_file <- opt$observations
neighbors_file <- opt$neighbors

if (!is.na(opt$neighbors)) {
neighbors_file <- opt$neighbors
}

if (!is.na(opt$matrix)) {
matrix_file <- opt$matrix
}

if (!is.na(opt$dim_red)) {
dimred_file <- opt$dim_red
2 changes: 1 addition & 1 deletion method/GraphST/GraphST_optargs.json
@@ -1,5 +1,5 @@
{
"matrix": "dimensionality_reduction",
"matrix": "transform",
"integrated_feature_selection": true,
"image": false,
"neighbors": true,
3 changes: 3 additions & 0 deletions method/GraphST/method_GraphST.py
@@ -156,6 +156,9 @@ def get_anndata(args):
adata = get_anndata(args)
adata.var_names_make_unique()

# Assuming transform=log1p, here's the scaling step: https://github.com/JinmiaoChenLab/GraphST/blob/main/GraphST/preprocess.py
sc.pp.scale(adata, zero_center=False, max_value=10)

# Set seed
random.seed(seed)
torch.manual_seed(seed)
3 changes: 3 additions & 0 deletions method/SCAN-IT/method_scanit.py
@@ -131,6 +131,9 @@ def get_anndata(args):
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'

# Assuming transform=log1p, here's the scaling step: https://github.com/zcang/SCAN-IT/blob/main/examples/Slide-seq/scanit.ipynb
sc.pp.scale(adata)

# Construct the spatial graph
scanit.tl.spatial_graph(adata, method='alpha shape', alpha_n_layer=2, knn_n_neighbors=5)

2 changes: 1 addition & 1 deletion method/SOTIP/method_sotip.py
@@ -171,7 +171,7 @@ def get_anndata(args):
adata.obsp['ME_EMD_mat'] = adata_phEMD.obsm['X_ME_EMD_mat']

# Compute the MEG, each node is a ME, edge is the connectivity
sc.pp.neighbors(adata, n_neighbors=n_neighbors)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep="reduced_dimensions")
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(adata_phEMD.obsm['X_ME_EMD_mat'], n_neighbors=n_neighbors, metric='precomputed')
adata.obsp['distances'], adata.obsp['connectivities'] = sc.neighbors._compute_connectivities_umap(
knn_indices,
4 changes: 2 additions & 2 deletions method/SpaceFlow/method_spaceflow.py
@@ -145,7 +145,7 @@ def get_anndata(args):
sf.spatial_graph = spatial_graph

# Train the network
sf.train(spatial_regularization_strength=0.1, z_dim=50, lr=1e-3, epochs=1000, max_patience=50, min_stop=100, random_seed=seed, gpu=device, regularization_acceleration=True, edge_subset_sz=1000000, embedding_save_filepath=out_dir)
sf.train(spatial_regularization_strength=0.1, z_dim=50, lr=1e-3, epochs=1000, max_patience=50, min_stop=100, random_seed=seed, gpu=device, regularization_acceleration=True, edge_subset_sz=1000000, embedding_save_filepath=embedding_file)

if config is not None:
res = int(config['res'])
@@ -158,7 +158,7 @@ def get_anndata(args):
warnings.warn("The `n_clusters` parameter was not used; config['res'] used instead.")

# Segment the domains given the resolution
sf.segmentation(domain_label_save_filepath=out_dir, n_neighbors=nn, resolution=res)
sf.segmentation(domain_label_save_filepath=label_file, n_neighbors=nn, resolution=res)

label_df = pd.DataFrame(sf.domains) # DataFrame with index (cell-id/barcode) and 1 column (label)
embedding_df = pd.DataFrame(sf.embedding, index=adata.obs_names) # DataFrame with index (cell-id/barcode) and n columns
2 changes: 1 addition & 1 deletion method/SpaceFlow/spaceflow_optargs.json
@@ -1,5 +1,5 @@
{
"matrix": "dimensionality_reduction",
"matrix": "transform",
"integrated_feature_selection": false,
"image": false,
"neighbors": false,
21 changes: 15 additions & 6 deletions method/SpiceMix/SpiceMix.py
@@ -22,28 +22,37 @@
parser.add_argument(
"-o", "--observations", help="Path to observations (as tsv).", required=True
)

parser.add_argument(
"-n",
"--neighbors",
help="Path to neighbor definitions. Square matrix (not necessarily symmetric) where each row contains the neighbors of this observation (as mtx).",
required=False,
)

parser.add_argument("-d", "--out_dir", help="Output directory.", required=True)

parser.add_argument(
"--dim_red",
help="Reduced dimensionality representation (e.g. PCA).",
required=False,
)
parser.add_argument("--image", help="Path to H&E staining.", required=False)
parser.add_argument(
"--n_clusters", help="Number of clusters to return.", required=True, type=int
)
parser.add_argument(
"--technology",
help="The technology of the dataset (Visium, ST, imaging-based).",
required=True,
)
parser.add_argument(
"--seed", help="Seed to use for random operations.", required=True, type=int
)
parser.add_argument(
"--config",
help="Config file used to pass additional parameters.",
required=True,
help="Optional config file (json) used to pass additional parameters.",
required=False,
)


args = parser.parse_args()

from pathlib import Path
@@ -141,7 +141,7 @@ def get_anndata(args):

del adata.obsp["spatial_connectivities"]

adata = PopariDataset(adata, "raw")
adata = PopariDataset(adata, "processed")

return adata

6 changes: 3 additions & 3 deletions method/SpiceMix/SpiceMix_optargs.json
@@ -1,7 +1,7 @@
{
"matrix": "counts",
"integrated_feature_selection": false,
"matrix": "transform",
"integrated_feature_selection": true,
"image": false,
"neighbors": false,
"neighbors": true,
"config_file": true
}
5 changes: 1 addition & 4 deletions method/SpiceMix/config/config_1.json
@@ -4,8 +4,5 @@
"device": "cuda:0",
"dtype": "float64",
"num_preiterations": 5,
"num_iterations": 200,
"preprocess": {
"hvgs": 3500
}
"num_iterations": 200
}
