Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data ag kuppe #109

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions data/agkuppe_heart_kidney/agkuppe.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Conda environment for the AG Kuppe heart/kidney data-curation script
# (setup_merfish.py). Versions are pinned for reproducibility.
channels:
- conda-forge
dependencies:
- python=3.11.6
- scipy=1.11.4
- anndata=0.10.3
- numpy=1.23.4
- pandas=2.1.3
123 changes: 123 additions & 0 deletions data/agkuppe_heart_kidney/setup_merfish.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that this does not download a public dataset but rather curates data that already needs to be present, I am not sure if it makes sense to add the script to the repo?
Don't get me wrong we can definitely still use the data!
@naveedishaque what is our take on this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure. I would say we keep it, as it is required to take the data from the current source. If the source changes, then we won't need the script, but right now we do. Maybe tag the dataset as a "v1"?

Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import os
import argparse
import shutil
import scipy
import anndata
import numpy as np
import pandas as pd
import json


# Transgene/reporter probes that are not real genes; dropped from the matrix.
BAD_GENES = ["eGFP", "mCherry2", "tdToma"]
# Written verbatim to experiment.json. "MERFISH" is the conventional
# capitalization of the technology name (requested in review).
META_DICT = {"technology": "MERFISH"}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change to "MERFISH"

# Default row written to samples.tsv.
# NOTE(review): the "directory" value is a literal string — the f-string has
# no {placeholders}, so it is never evaluated (and `args` does not exist at
# module scope anyway). It should be replaced at runtime with the basename of
# the output folder.
SAMPLE_INFO = {"patient":"1","sample":"1","position":"0","replicate":"1","n_clusters":"0","directory":f"os.path.basename(args.output)"}
# Usage terms written verbatim to LICENSE.md next to the converted data.
LICENSE = """
This dataset was created by AG Kuppe at the University Hospital Aachen, Germany.

It may only be used in the context of the Spacehack 2023 event.

In case of any questions feel free to contact Paul Kiessling, [email protected].
"""


def copy_images(input_folder, output_folder):
    """Copy every .tif image from input_folder into output_folder.

    The output folder is created if missing; shutil.copy2 preserves
    file metadata. One line is printed per copied file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Only TIFF images are of interest; ignore everything else.
    tif_names = [name for name in os.listdir(input_folder) if name.endswith(".tif")]
    for name in tif_names:
        input_path = os.path.join(input_folder, name)
        output_path = os.path.join(output_folder, name)
        shutil.copy2(input_path, output_path)
        print(f"Copied: {input_path} to {output_path}")


def load_into_anndata(input_folder):
    """Load Merscope cell-by-gene counts and cell metadata into an AnnData.

    Control ("blank") probes are split off into ``obsm["blank"]``, transgene
    probes listed in BAD_GENES are removed, and the cell centroid columns
    (center_x/center_y) are copied into ``obsm["spatial"]``.
    """
    counts = pd.read_csv(input_folder + "/cell_by_gene.csv", index_col=0, dtype={"cell": str})
    meta = pd.read_csv(input_folder + "/cell_metadata.csv", index_col=0, dtype={"EntityID": str})

    # Columns whose name contains "blank" (any case) are control probes.
    gene_mask = ~counts.columns.str.lower().str.contains("blank")
    adata = anndata.AnnData(counts.loc[:, gene_mask], dtype=counts.values.dtype, obs=meta)
    adata.obsm["blank"] = counts.loc[:, ~gene_mask]

    adata = adata[:, ~adata.var_names.isin(BAD_GENES)]
    adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].values
    # Keep the barcode both as the index and as an explicit column.
    adata.obs["EntityID"] = adata.obs.index
    return adata

def convert_data(input_folder, output_folder, ct_file):
    """Convert one Merscope sample folder into the SpaceHack layout.

    Writes observations.tsv, features.tsv, coordinates.tsv, counts.mtx and
    anndata.h5ad into ``output_folder``. If ``ct_file`` (a TSV indexed by
    cell barcode with a "cell_type" column) is given, cell types are attached
    and cells without an annotation are labelled "filtered".
    """
    os.makedirs(output_folder, exist_ok=True)
    adata = load_into_anndata(input_folder)

    # Observations
    obs = adata.obs.copy()
    obs["selected"] = "true"
    if ct_file is not None:  # was `!= None`; identity check is the idiom
        print("adding ct")
        ct = pd.read_table(ct_file, index_col=0)
        ct.index = ct.index.astype("str")
        # Align by barcode; cells missing from the annotation become
        # "filtered". Plain assignment replaces the previous
        # fillna(inplace=True)-on-a-selection, which pandas deprecates.
        obs["cell_type"] = ct["cell_type"].reindex(obs.index).fillna("filtered")
        # Keep the same annotation in the AnnData written to disk.
        adata.obs["cell_type"] = obs["cell_type"]

    obs.to_csv(f"{output_folder}/observations.tsv", sep="\t", index_label="")

    # Features (renamed from `vars`, which shadowed the builtin)
    features = adata.var.copy()
    features["selected"] = "true"
    features.to_csv(f"{output_folder}/features.tsv", sep="\t", index_label="")

    # Coordinates
    coords = pd.DataFrame(adata.obsm["spatial"], columns=["x", "y"])
    coords.index = adata.obs.index
    coords.to_csv(f"{output_folder}/coordinates.tsv", sep="\t", index_label="")

    # Matrix in MatrixMarket format
    scipy.io.mmwrite(f"{output_folder}/counts.mtx", adata.X)

    # Full AnnData for convenience
    adata.write_h5ad(f"{output_folder}/anndata.h5ad")


def write_json(data, output_path):
    """Serialize `data` as JSON to `output_path` (overwrites any existing file).

    The parameter was previously named `dict`, shadowing the builtin;
    the only call site passes it positionally, so the rename is safe.
    """
    with open(output_path, 'w') as json_file:
        json.dump(data, json_file)




def main():
    """CLI entry point: convert one Merscope sample and write dataset metadata."""
    # Set up command-line argument parser
    parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.")

    # Add arguments for input and output folders
    parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output", required=True)
    parser.add_argument("--output", help="Path to the output folder", required=True)
    parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'", required=False)

    # Parse the command-line arguments
    args = parser.parse_args()

    # Convert the expression data and per-cell tables.
    convert_data(args.input, args.output, args.ct)
    # copy_images(os.path.join(args.input, "images"), args.output)

    # The dataset root is one level above the sample output folder.
    dataset_dir = os.path.dirname(args.output)

    # Experiment-level metadata.
    write_json(META_DICT, os.path.join(dataset_dir, "experiment.json"))

    # samples.tsv — fill in the sample directory here: the module-level
    # SAMPLE_INFO["directory"] is a broken f-string that is never evaluated,
    # so override it with the actual output folder name.
    sample_info = dict(SAMPLE_INFO, directory=os.path.basename(args.output))
    sample_df = pd.DataFrame.from_dict(sample_info, orient='index').T
    sample_df.to_csv(f"{dataset_dir}/samples.tsv", sep="\t", index_label=False)

    # write LICENSE
    with open(f"{dataset_dir}/LICENSE.md", 'w') as license_file:
        license_file.write(LICENSE)


if __name__ == "__main__":
    main()
140 changes: 140 additions & 0 deletions data/libd_dlpfc/.ipynb_checkpoints/libd_dlpfc-checkpoint.r
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this file

Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env Rscript

# Author_and_contribution: Niklas Mueller-Boetticher; created script

suppressPackageStartupMessages(library(optparse))

# Command-line interface: a single argument, the output directory.
option_list <- list(
  make_option(
    c("-o", "--out_dir"),
    type = "character", default = NULL,
    help = "Output directory to write files to."
  )
)

description <- "Load data for LIBD DLPFC (http://research.libd.org/spatialLIBD/)."

opt_parser <- OptionParser(
  usage = description,
  option_list = option_list
)
opt <- parse_args(opt_parser)

out_dir <- opt$out_dir

## Your code goes here
suppressPackageStartupMessages(library(spatialLIBD))
suppressPackageStartupMessages(library(magrittr))

# Write one SpatialExperiment sample in the SpaceHack folder layout:
# labels.tsv, observations.tsv, features.tsv, coordinates.tsv, counts.mtx.
write_SpatialExperiment_to_folder <- function(
    spe, path, obs_col, label_col = "label", assay_name = "counts") {
  dir.create(path, showWarnings = FALSE, recursive = TRUE)

  # Ground-truth labels; spots without an annotation are dropped.
  colData(spe)[label_col] %>%
    as.data.frame() %>%
    dplyr::filter(!is.na(!!as.symbol(label_col))) %>%
    write.table(file.path(path, "labels.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Per-spot observation columns (e.g. array row/col).
  colData(spe)[obs_col] %>%
    as.data.frame() %>%
    write.table(file.path(path, "observations.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Gene-level metadata.
  rowData(spe) %>%
    as.data.frame() %>%
    write.table(file.path(path, "features.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Full-resolution pixel coordinates, coerced to integer and renamed x/y.
  coords <- spatialCoords(spe)
  mode(coords) <- "integer"
  as.data.frame(coords) %>%
    dplyr::rename(x = "pxl_col_in_fullres", y = "pxl_row_in_fullres") %>%
    write.table(file.path(path, "coordinates.tsv"), sep = "\t", col.names = NA, quote = FALSE)

  # Counts as MatrixMarket, transposed so rows are spots.
  assay(spe, assay_name) %>%
    t() %>%
    Matrix::writeMM(file.path(path, "counts.mtx"))
}

# Download the spatialLIBD SpatialExperiment (all DLPFC samples).
spe <- fetch_data("spe")

# Keep only the metadata columns needed downstream and rename them to the
# SpaceHack conventions (row / col / label).
keep_cols <- c("sample_id", "subject", "position", "replicate", "discard", "spatialLIBD", "array_row", "array_col")

colData(spe) <- colData(spe)[, keep_cols]
colnames(colData(spe))[colnames(colData(spe)) == "array_row"] <- "row"
colnames(colData(spe))[colnames(colData(spe)) == "array_col"] <- "col"
colnames(colData(spe))[colnames(colData(spe)) == "spatialLIBD"] <- "label"

keep_rows <- c("gene_version", "gene_name", "source", "gene_biotype")
rowData(spe) <- rowData(spe)[, keep_rows]

# Export every sample into <out_dir>/<patient>_<sample>/.
patients <- unique(colData(spe)$subject)
for (patient in patients) {
  patient_spe <- spe[, spe$subject == patient]
  samples <- unique(colData(patient_spe)$sample_id)
  for (sample in samples) {
    spe_sample <- patient_spe[, patient_spe$sample_id == sample]
    colData(spe_sample) <- colData(spe_sample)[, c("label", "row", "col")] # "discard"
    write_SpatialExperiment_to_folder(
      spe_sample,
      file.path(out_dir, paste(patient, sample, sep = "_")),
      obs_col = c("row", "col")
    )
  }
}

# Lookup table sample_id -> patient, used below to locate image folders.
sample2patient <- colData(spe)[, c("sample_id", "subject")] %>%
  as.data.frame() %>%
  dplyr::distinct() %>%
  tibble::deframe()

# Full-resolution H&E images hosted on S3, one per sample.
img_links <- c(
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151507_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151508_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151509_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151510_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151669_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151670_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151671_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151672_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151673_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151674_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151675_full_image.tif",
  "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151676_full_image.tif"
)

# Derive the sample id from each URL and map it to its patient so the image
# lands in the matching <patient>_<sample> folder.
img_links <- tibble::as_tibble(list("link" = img_links)) %>%
  dplyr::mutate(
    sample = stringr::str_extract(link, "([^/]+)_full_image.tif$", group = 1),
    patient = sample2patient[sample],
    filename = "H_E.tiff"
  )

# Images are large; allow up to an hour per download.
options(timeout = 60 * 60)
purrr::pwalk(img_links, function(link, sample, patient, filename) {
  download.file(
    link,
    file.path(out_dir, paste(patient, sample, sep = "_"), filename),
    "wget", # download method
    quiet = TRUE
  )
})

# Images are stored at full resolution, so the scale factor is 1.
purrr::pwalk(img_links, function(link, sample, patient, filename) {
  json <- file(file.path(out_dir, paste(patient, sample, sep = "_"), "H_E.json"))
  writeLines(c('{"scale": 1}'), json)
  close(json)
})

# samples.tsv: one row per sample with the number of distinct annotated
# labels (n_clusters) and the directory the sample was exported to.
colData(spe) %>%
  as.data.frame() %>%
  dplyr::select(patient = subject, sample = sample_id, position, replicate, label) %>%
  dplyr::filter(!is.na(label)) %>%
  dplyr::distinct() %>%
  dplyr::count(patient, sample, position, replicate) %>%
  dplyr::rename(n_clusters = n) %>%
  dplyr::mutate(directory = paste(patient, sample, sep = "_")) %>%
  `row.names<-`(NULL) %>%
  write.table(file.path(out_dir, "samples.tsv"), sep = "\t", col.names = NA, quote = FALSE)

# Dataset-level metadata.
json <- file(file.path(out_dir, "experiment.json"))
writeLines(c('{"technology": "Visium"}'), json)
close(json)