Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data ag kuppe #109

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions data/agkuppe_heart_kidney/agkuppe.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Conda environment for the AG Kuppe heart/kidney data-curation script
# (setup_merfish.py). Versions are pinned for reproducibility.
channels:
- conda-forge
dependencies:
- python=3.11.6
- scipy=1.11.4
- anndata=0.10.3
- numpy=1.23.4
- pandas=2.1.3
123 changes: 123 additions & 0 deletions data/agkuppe_heart_kidney/setup_merfish.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that this does not download a public dataset but rather curates data that already needs to be present, I am not sure if it makes sense to add the script to the repo?
Don't get me wrong we can definitely still use the data!
@naveedishaque what is our take on this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure. I would say we keep it, as it is required to take the data from the current source. If the source changes, then we won't need the script, but right now we do. Maybe tag the dataset as a "v1"?

Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import os
import argparse
import shutil
import scipy
import anndata
import numpy as np
import pandas as pd
import json


# Transgene/reporter probes that are not real genes; dropped from the matrix.
BAD_GENES = ["eGFP", "mCherry2", "tdToma"]
# Written verbatim to experiment.json. "MERFISH" is the conventional
# capitalization of the technology name (requested in review).
META_DICT = {"technology": "MERFISH"}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change to "MERFISH"

# Default row written to samples.tsv.
# NOTE(review): the "directory" value is a literal string — the f-string has
# no {placeholders}, so it is never evaluated (and `args` does not exist at
# module scope anyway). It should be replaced at runtime with the basename of
# the output folder.
SAMPLE_INFO = {"patient":"1","sample":"1","position":"0","replicate":"1","n_clusters":"0","directory":f"os.path.basename(args.output)"}
# Usage terms written verbatim to LICENSE.md next to the converted data.
LICENSE = """
This dataset was created by AG Kuppe at the University Hospital Aachen, Germany.

It may only be used in the context of the Spacehack 2023 event.

In case of any questions feel free to contact Paul Kiessling, [email protected].
"""


def copy_images(input_folder, output_folder):
    """Copy every .tif image from input_folder into output_folder.

    The output folder is created if missing; shutil.copy2 preserves
    file metadata. One line is printed per copied file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Only TIFF images are of interest; ignore everything else.
    tif_names = [name for name in os.listdir(input_folder) if name.endswith(".tif")]
    for name in tif_names:
        input_path = os.path.join(input_folder, name)
        output_path = os.path.join(output_folder, name)
        shutil.copy2(input_path, output_path)
        print(f"Copied: {input_path} to {output_path}")


def load_into_anndata(input_folder):
    """Load Merscope cell-by-gene counts and cell metadata into an AnnData.

    Control ("blank") probes are split off into ``obsm["blank"]``, transgene
    probes listed in BAD_GENES are removed, and the cell centroid columns
    (center_x/center_y) are copied into ``obsm["spatial"]``.
    """
    counts = pd.read_csv(input_folder + "/cell_by_gene.csv", index_col=0, dtype={"cell": str})
    meta = pd.read_csv(input_folder + "/cell_metadata.csv", index_col=0, dtype={"EntityID": str})

    # Columns whose name contains "blank" (any case) are control probes.
    gene_mask = ~counts.columns.str.lower().str.contains("blank")
    adata = anndata.AnnData(counts.loc[:, gene_mask], dtype=counts.values.dtype, obs=meta)
    adata.obsm["blank"] = counts.loc[:, ~gene_mask]

    adata = adata[:, ~adata.var_names.isin(BAD_GENES)]
    adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].values
    # Keep the barcode both as the index and as an explicit column.
    adata.obs["EntityID"] = adata.obs.index
    return adata

def convert_data(input_folder, output_folder, ct_file):
    """Convert one Merscope sample folder into the SpaceHack layout.

    Writes observations.tsv, features.tsv, coordinates.tsv, counts.mtx and
    anndata.h5ad into ``output_folder``. If ``ct_file`` (a TSV indexed by
    cell barcode with a "cell_type" column) is given, cell types are attached
    and cells without an annotation are labelled "filtered".
    """
    os.makedirs(output_folder, exist_ok=True)
    adata = load_into_anndata(input_folder)

    # Observations
    obs = adata.obs.copy()
    obs["selected"] = "true"
    if ct_file is not None:  # was `!= None`; identity check is the idiom
        print("adding ct")
        ct = pd.read_table(ct_file, index_col=0)
        ct.index = ct.index.astype("str")
        # Align by barcode; cells missing from the annotation become
        # "filtered". Plain assignment replaces the previous
        # fillna(inplace=True)-on-a-selection, which pandas deprecates.
        obs["cell_type"] = ct["cell_type"].reindex(obs.index).fillna("filtered")
        # Keep the same annotation in the AnnData written to disk.
        adata.obs["cell_type"] = obs["cell_type"]

    obs.to_csv(f"{output_folder}/observations.tsv", sep="\t", index_label="")

    # Features (renamed from `vars`, which shadowed the builtin)
    features = adata.var.copy()
    features["selected"] = "true"
    features.to_csv(f"{output_folder}/features.tsv", sep="\t", index_label="")

    # Coordinates
    coords = pd.DataFrame(adata.obsm["spatial"], columns=["x", "y"])
    coords.index = adata.obs.index
    coords.to_csv(f"{output_folder}/coordinates.tsv", sep="\t", index_label="")

    # Matrix in MatrixMarket format
    scipy.io.mmwrite(f"{output_folder}/counts.mtx", adata.X)

    # Full AnnData for convenience
    adata.write_h5ad(f"{output_folder}/anndata.h5ad")


def write_json(data, output_path):
    """Serialize `data` as JSON to `output_path` (overwrites any existing file).

    The parameter was previously named `dict`, shadowing the builtin;
    the only call site passes it positionally, so the rename is safe.
    """
    with open(output_path, 'w') as json_file:
        json.dump(data, json_file)




def main():
    """CLI entry point: convert one Merscope sample and write dataset metadata."""
    # Set up command-line argument parser
    parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.")

    # Add arguments for input and output folders
    parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output", required=True)
    parser.add_argument("--output", help="Path to the output folder", required=True)
    parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'", required=False)

    # Parse the command-line arguments
    args = parser.parse_args()

    # Convert the expression data and per-cell tables.
    convert_data(args.input, args.output, args.ct)
    # copy_images(os.path.join(args.input, "images"), args.output)

    # The dataset root is one level above the sample output folder.
    dataset_dir = os.path.dirname(args.output)

    # Experiment-level metadata.
    write_json(META_DICT, os.path.join(dataset_dir, "experiment.json"))

    # samples.tsv — fill in the sample directory here: the module-level
    # SAMPLE_INFO["directory"] is a broken f-string that is never evaluated,
    # so override it with the actual output folder name.
    sample_info = dict(SAMPLE_INFO, directory=os.path.basename(args.output))
    sample_df = pd.DataFrame.from_dict(sample_info, orient='index').T
    sample_df.to_csv(f"{dataset_dir}/samples.tsv", sep="\t", index_label=False)

    # write LICENSE
    with open(f"{dataset_dir}/LICENSE.md", 'w') as license_file:
        license_file.write(LICENSE)


if __name__ == "__main__":
    main()
140 changes: 140 additions & 0 deletions data/libd_dlpfc/.ipynb_checkpoints/libd_dlpfc-checkpoint.r
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this file

Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env Rscript

# Author_and_contribution: Niklas Mueller-Boetticher; created script

suppressPackageStartupMessages(library(optparse))

# Command-line interface: a single argument, the output directory.
option_list <- list(
  make_option(
    c("-o", "--out_dir"),
    type = "character", default = NULL,
    help = "Output directory to write files to."
  )
)

description <- "Load data for LIBD DLPFC (http://research.libd.org/spatialLIBD/)."

opt_parser <- OptionParser(
  usage = description,
  option_list = option_list
)
opt <- parse_args(opt_parser)

out_dir <- opt$out_dir

## Your code goes here
suppressPackageStartupMessages(library(spatialLIBD))
suppressPackageStartupMessages(library(magrittr))

# Write one SpatialExperiment sample in the SpaceHack folder layout:
# labels.tsv, observations.tsv, features.tsv, coordinates.tsv, counts.mtx.
write_SpatialExperiment_to_folder <- function(
    spe, path, obs_col, label_col = "label", assay_name = "counts") {
  dir.create(path, showWarnings = FALSE, recursive = TRUE)

  # Ground-truth labels; spots without an annotation are dropped.
  colData(spe)[label_col] %>%
    as.data.frame() %>%
    dplyr::filter(!is.na(!!as.symbol(label_col))) %>%
    write.table(file.path(path, "labels.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Per-spot observation columns (e.g. array row/col).
  colData(spe)[obs_col] %>%
    as.data.frame() %>%
    write.table(file.path(path, "observations.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Gene-level metadata.
  rowData(spe) %>%
    as.data.frame() %>%
    write.table(file.path(path, "features.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Full-resolution pixel coordinates, coerced to integer and renamed x/y.
  coords <- spatialCoords(spe)
  mode(coords) <- "integer"
  as.data.frame(coords) %>%
    dplyr::rename(x = "pxl_col_in_fullres", y = "pxl_row_in_fullres") %>%
    write.table(file.path(path, "coordinates.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Counts as MatrixMarket, transposed so rows are spots.
  assay(spe, assay_name) %>%
    t() %>%
    Matrix::writeMM(file.path(path, "counts.mtx"))
}

# Download the spatialLIBD SpatialExperiment (all DLPFC samples).
spe <- fetch_data("spe")

# Keep only the metadata columns needed downstream and rename them to the
# SpaceHack conventions (row / col / label).
keep_cols <- c("sample_id", "subject", "position", "replicate", "discard", "spatialLIBD", "array_row", "array_col")

colData(spe) <- colData(spe)[, keep_cols]
colnames(colData(spe))[colnames(colData(spe)) == "array_row"] <- "row"
colnames(colData(spe))[colnames(colData(spe)) == "array_col"] <- "col"
colnames(colData(spe))[colnames(colData(spe)) == "spatialLIBD"] <- "label"

keep_rows <- c("gene_version", "gene_name", "source", "gene_biotype")
rowData(spe) <- rowData(spe)[, keep_rows]

# Export every sample into <out_dir>/<patient>_<sample>/.
patients <- unique(colData(spe)$subject)
for (patient in patients) {
  patient_spe <- spe[, spe$subject == patient]
  samples <- unique(colData(patient_spe)$sample_id)
  for (sample in samples) {
    spe_sample <- patient_spe[, patient_spe$sample_id == sample]
    colData(spe_sample) <- colData(spe_sample)[, c("label", "row", "col")] # "discard"
    write_SpatialExperiment_to_folder(
      spe_sample,
      file.path(out_dir, paste(patient, sample, sep = "_")),
      obs_col = c("row", "col")
    )
  }
}

# Lookup table sample_id -> patient, used below to locate image folders.
sample2patient <- colData(spe)[, c("sample_id", "subject")] %>%
  as.data.frame() %>%
  dplyr::distinct() %>%
  tibble::deframe()

# Full-resolution H&E images hosted on S3, one per sample.
img_links <- c(
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151507_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151508_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151509_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151510_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151669_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151670_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151671_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151672_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151673_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151674_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151675_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151676_full_image.tif"
)

# Derive the sample id from each URL and map it to its patient so the image
# lands in the matching <patient>_<sample> folder.
img_links <- tibble::as_tibble(list("link" = img_links)) %>%
  dplyr::mutate(
    sample = stringr::str_extract(link, "([^/]+)_full_image.tif$", group = 1),
    patient = sample2patient[sample],
    filename = "H_E.tiff"
  )

# Images are large; allow up to an hour per download.
options(timeout = 60 * 60)
purrr::pwalk(img_links, function(link, sample, patient, filename) {
  download.file(
    link,
    file.path(out_dir, paste(patient, sample, sep = "_"), filename),
    "wget", # download method
    quiet = TRUE
  )
})

# Images are stored at full resolution, so the scale factor is 1.
purrr::pwalk(img_links, function(link, sample, patient, filename) {
  json <- file(file.path(out_dir, paste(patient, sample, sep = "_"), "H_E.json"))
  writeLines(c('{"scale": 1}'), json)
  close(json)
})

# samples.tsv: one row per sample with the number of distinct annotated
# labels (n_clusters) and the directory the sample was exported to.
colData(spe) %>%
  as.data.frame() %>%
  dplyr::select(patient = subject, sample = sample_id, position, replicate, label) %>%
  dplyr::filter(!is.na(label)) %>%
  dplyr::distinct() %>%
  dplyr::count(patient, sample, position, replicate) %>%
  dplyr::rename(n_clusters = n) %>%
  dplyr::mutate(directory = paste(patient, sample, sep = "_")) %>%
  `row.names<-`(NULL) %>%
  write.table(file.path(out_dir, "samples.tsv"), sep = "\t", col.names = NA, quote = FALSE)

# Dataset-level metadata.
json <- file(file.path(out_dir, "experiment.json"))
writeLines(c('{"technology": "Visium"}'), json)
close(json)