From 174080efaf415e9c2d082da551448167fabe5c6f Mon Sep 17 00:00:00 2001 From: pakiessling Date: Wed, 13 Dec 2023 08:20:47 +0000 Subject: [PATCH 1/4] initial script --- data/agkuppe_heart/setup_merfish.py | 81 ++++++++++++++++++++++++++++ data/agkuppe_kidney/setup_merfish.py | 81 ++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 data/agkuppe_heart/setup_merfish.py create mode 100644 data/agkuppe_kidney/setup_merfish.py diff --git a/data/agkuppe_heart/setup_merfish.py b/data/agkuppe_heart/setup_merfish.py new file mode 100644 index 00000000..7717249c --- /dev/null +++ b/data/agkuppe_heart/setup_merfish.py @@ -0,0 +1,81 @@ +import os +import argparse +import shutil +import scipy +import squidpy as sq # Needs to be installed from github not pypi +import numpy as np +import pandas as pd + +def copy_images(input_folder, output_folder): + # Ensure the output folder exists, create if not + os.makedirs(output_folder, exist_ok=True) + + # Get a list of files in the input folder + files = os.listdir(input_folder) + files = [file for file in files if file.endswith(".tif")] + # Copy image + for file in files: + input_path = os.path.join(input_folder, file) + output_path = os.path.join(output_folder, file) + shutil.copy2(input_path, output_path) + print(f"Copied: {input_path} to {output_path}") + + +def convert_data(input_folder, output_folder,ct_file): + os.makedirs(output_folder, exist_ok=True) + adata = sq.read.vizgen(input_folder,counts_file="cell_by_gene.csv",meta_file="cell_metadata.csv") + + # Observations + obs = adata.obs.copy() + obs["selected"] = "true" + if ct_file != None: + print("adding ct") + ct = pd.read_table(ct_file, index_col=0) + ct.index = ct.index.astype("str") + obs["cell_type"] = ct["cell_type"] + obs['cell_type'].fillna('filtered', inplace=True) + + adata.obs["cell_type"] = ct["cell_type"] + adata.obs["cell_type"].fillna('filtered', inplace=True) + + obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") + + # Features + vars = adata.var.copy() + vars["selected"] = "true" + bad_genes = ["eGFP","mCherry2","tdToma"] + vars.loc[vars.index.isin(bad_genes), "selected"] = "false" + vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") + + # Coordinates + coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) + coords.index = adata.obs.index + coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") + + # Matrix + scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) + + # Anndata + adata.write_h5ad(f"{output_folder}/anndata.h5ad") + + +def main(): + # Set up command-line argument parser + parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") + + # Add arguments for input and output folders + parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) + parser.add_argument("--output", help="Path to the output folder",required=True) + parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) + + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function to copy files + convert_data(args.input, args.output,args.ct) + #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) + + +if __name__ == "__main__": + main() diff --git a/data/agkuppe_kidney/setup_merfish.py b/data/agkuppe_kidney/setup_merfish.py new file mode 100644 index 00000000..7717249c --- /dev/null +++ b/data/agkuppe_kidney/setup_merfish.py @@ -0,0 +1,81 @@ +import os +import argparse +import shutil +import scipy +import squidpy as sq # Needs to be installed from github not pypi +import numpy as np +import pandas as pd + +def copy_images(input_folder, output_folder): + # Ensure the output folder exists, create if not + os.makedirs(output_folder, exist_ok=True) + + # Get a list of files in the input folder + files = os.listdir(input_folder) + files = [file for file in files if file.endswith(".tif")] + # Copy image + for file in files: + input_path = os.path.join(input_folder, file) + output_path = os.path.join(output_folder, file) + shutil.copy2(input_path, output_path) + print(f"Copied: {input_path} to {output_path}") + + +def convert_data(input_folder, output_folder,ct_file): + os.makedirs(output_folder, exist_ok=True) + adata = sq.read.vizgen(input_folder,counts_file="cell_by_gene.csv",meta_file="cell_metadata.csv") + + # Observations + obs = adata.obs.copy() + obs["selected"] = "true" + if ct_file != None: + print("adding ct") + ct = pd.read_table(ct_file, index_col=0) + ct.index = ct.index.astype("str") + obs["cell_type"] = ct["cell_type"] + obs['cell_type'].fillna('filtered', inplace=True) + + adata.obs["cell_type"] = ct["cell_type"] + adata.obs["cell_type"].fillna('filtered', inplace=True) + + obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") + + # Features + vars = adata.var.copy() + vars["selected"] = "true" + bad_genes = ["eGFP","mCherry2","tdToma"] + vars.loc[vars.index.isin(bad_genes), "selected"] = "false" + vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") + + # Coordinates + coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) + coords.index = adata.obs.index + coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") + + # Matrix + scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) + + # Anndata + adata.write_h5ad(f"{output_folder}/anndata.h5ad") + + +def main(): + # Set up command-line argument parser + parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") + + # Add arguments for input and output folders + parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) + parser.add_argument("--output", help="Path to the output folder",required=True) + parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) + + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function to copy files + convert_data(args.input, args.output,args.ct) + #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) + + +if __name__ == "__main__": + main() From 8da904478b288503a8b01154a9b93a374cb033cc Mon Sep 17 00:00:00 2001 From: pakiessling Date: Wed, 13 Dec 2023 09:59:35 +0000 Subject: [PATCH 2/4] refactored, unified kidney and heart --- .../.ipynb_checkpoints/agkuppe-checkpoint.yml | 8 ++ .../setup_merfish-checkpoint.py | 123 ++++++++++++++++++ data/agkuppe_heart_kidney/agkuppe.yml | 8 ++ data/agkuppe_heart_kidney/setup_merfish.py | 123 ++++++++++++++++++ 4 files changed, 262 insertions(+) create mode 100644 data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml create mode 100644 data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py create mode 100644 data/agkuppe_heart_kidney/agkuppe.yml create mode 100644 data/agkuppe_heart_kidney/setup_merfish.py diff --git a/data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml b/data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml new file mode 100644 index 00000000..c1711057 --- /dev/null +++ b/data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge +dependencies: + - python=3.11.6 + - scipy=1.11.4 + - anndata=0.10.3 + - numpy=1.23.4 + - pandas=2.1.3 \ No newline at end of file diff --git a/data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py b/data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py new file mode 100644 index 00000000..30d78f13 --- /dev/null +++ b/data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py @@ -0,0 +1,123 @@ +import os +import argparse +import shutil +import scipy +import anndata +import numpy as np +import pandas as pd +import json + + +BAD_GENES = ["eGFP","mCherry2","tdToma"] +META_DICT = {"technology":"Merfish"} +SAMPLE_INFO = {"patient":"1","sample":"1","position":"0","replicate":"1","n_clusters":"0","directory":f"os.path.basename(args.output)"} +LICENSE = """ +This dataset was created by AG Kuppe at the University Hospital Aachen, Germany. + +It may only be used in the context of the Spacehack 2023 event. + +In case of any questions feel free to contact Paul Kiessling, pakiessling@ukaachen.de. +""" + + +def copy_images(input_folder, output_folder): + # Ensure the output folder exists, create if not + os.makedirs(output_folder, exist_ok=True) + + # Get a list of files in the input folder + files = os.listdir(input_folder) + files = [file for file in files if file.endswith(".tif")] + # Copy image + for file in files: + input_path = os.path.join(input_folder, file) + output_path = os.path.join(output_folder, file) + shutil.copy2(input_path, output_path) + print(f"Copied: {input_path} to {output_path}") + + +def load_into_anndata(input_folder): + data = pd.read_csv(input_folder + "/cell_by_gene.csv", index_col=0, dtype={"cell": str}) + obs = pd.read_csv(input_folder + "/cell_metadata.csv", index_col=0, dtype={"EntityID": str}) + is_gene = ~data.columns.str.lower().str.contains("blank") + adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) + adata.obsm["blank"] = data.loc[:, ~is_gene] + adata = adata[:,~adata.var_names.isin(BAD_GENES)] + adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].values + adata.obs["EntityID"] = adata.obs.index + return adata + +def convert_data(input_folder, output_folder,ct_file): + os.makedirs(output_folder, exist_ok=True) + adata = load_into_anndata(input_folder) + + # Observations + obs = adata.obs.copy() + obs["selected"] = "true" + if ct_file != None: + print("adding ct") + ct = pd.read_table(ct_file, index_col=0) + ct.index = ct.index.astype("str") + obs["cell_type"] = ct["cell_type"] + obs['cell_type'].fillna('filtered', inplace=True) + + adata.obs["cell_type"] = ct["cell_type"] + adata.obs["cell_type"].fillna('filtered', inplace=True) + + obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") + + # Features + vars = adata.var.copy() + vars["selected"] = "true" + + vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") + + # Coordinates + coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) + coords.index = adata.obs.index + coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") + + # Matrix + scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) + + # Anndata + adata.write_h5ad(f"{output_folder}/anndata.h5ad") + + +def write_json(dict,output_path): + with open(output_path, 'w') as json_file: + json.dump(dict, json_file) + + + + +def main(): + # Set up command-line argument parser + parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") + + # Add arguments for input and output folders + parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) + parser.add_argument("--output", help="Path to the output folder",required=True) + parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) + + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function to copy files + convert_data(args.input, args.output,args.ct) + #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) + + # write json + write_json(META_DICT,os.path.join(os.path.dirname(args.output), "experiment.json")) + + # write samples.tsv + sample_df = pd.DataFrame.from_dict(SAMPLE_INFO, orient='index').T + output_directory = os.path.dirname(args.output) + sample_df.to_csv(f"{output_directory}/samples.tsv", sep="\t", index_label=False) + + # write LICENSE + with open(f"{os.path.dirname(args.output)}/LICENSE.md", 'w') as file: + file.write(LICENSE) + +if __name__ == "__main__": + main() diff --git a/data/agkuppe_heart_kidney/agkuppe.yml b/data/agkuppe_heart_kidney/agkuppe.yml new file mode 100644 index 00000000..c1711057 --- /dev/null +++ b/data/agkuppe_heart_kidney/agkuppe.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge +dependencies: + - python=3.11.6 + - scipy=1.11.4 + - anndata=0.10.3 + - numpy=1.23.4 + - pandas=2.1.3 \ No newline at end of file diff --git a/data/agkuppe_heart_kidney/setup_merfish.py b/data/agkuppe_heart_kidney/setup_merfish.py new file mode 100644 index 00000000..30d78f13 --- /dev/null +++ b/data/agkuppe_heart_kidney/setup_merfish.py @@ -0,0 +1,123 @@ +import os +import argparse +import shutil +import scipy +import anndata +import numpy as np +import pandas as pd +import json + + +BAD_GENES = ["eGFP","mCherry2","tdToma"] +META_DICT = {"technology":"Merfish"} +SAMPLE_INFO = {"patient":"1","sample":"1","position":"0","replicate":"1","n_clusters":"0","directory":f"os.path.basename(args.output)"} +LICENSE = """ +This dataset was created by AG Kuppe at the University Hospital Aachen, Germany. + +It may only be used in the context of the Spacehack 2023 event. + +In case of any questions feel free to contact Paul Kiessling, pakiessling@ukaachen.de. +""" + + +def copy_images(input_folder, output_folder): + # Ensure the output folder exists, create if not + os.makedirs(output_folder, exist_ok=True) + + # Get a list of files in the input folder + files = os.listdir(input_folder) + files = [file for file in files if file.endswith(".tif")] + # Copy image + for file in files: + input_path = os.path.join(input_folder, file) + output_path = os.path.join(output_folder, file) + shutil.copy2(input_path, output_path) + print(f"Copied: {input_path} to {output_path}") + + +def load_into_anndata(input_folder): + data = pd.read_csv(input_folder + "/cell_by_gene.csv", index_col=0, dtype={"cell": str}) + obs = pd.read_csv(input_folder + "/cell_metadata.csv", index_col=0, dtype={"EntityID": str}) + is_gene = ~data.columns.str.lower().str.contains("blank") + adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) + adata.obsm["blank"] = data.loc[:, ~is_gene] + adata = adata[:,~adata.var_names.isin(BAD_GENES)] + adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].values + adata.obs["EntityID"] = adata.obs.index + return adata + +def convert_data(input_folder, output_folder,ct_file): + os.makedirs(output_folder, exist_ok=True) + adata = load_into_anndata(input_folder) + + # Observations + obs = adata.obs.copy() + obs["selected"] = "true" + if ct_file != None: + print("adding ct") + ct = pd.read_table(ct_file, index_col=0) + ct.index = ct.index.astype("str") + obs["cell_type"] = ct["cell_type"] + obs['cell_type'].fillna('filtered', inplace=True) + + adata.obs["cell_type"] = ct["cell_type"] + adata.obs["cell_type"].fillna('filtered', inplace=True) + + obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") + + # Features + vars = adata.var.copy() + vars["selected"] = "true" + + vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") + + # Coordinates + coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) + coords.index = adata.obs.index + coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") + + # Matrix + scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) + + # Anndata + adata.write_h5ad(f"{output_folder}/anndata.h5ad") + + +def write_json(dict,output_path): + with open(output_path, 'w') as json_file: + json.dump(dict, json_file) + + + + +def main(): + # Set up command-line argument parser + parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") + + # Add arguments for input and output folders + parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) + parser.add_argument("--output", help="Path to the output folder",required=True) + parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) + + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function to copy files + convert_data(args.input, args.output,args.ct) + #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) + + # write json + write_json(META_DICT,os.path.join(os.path.dirname(args.output), "experiment.json")) + + # write samples.tsv + sample_df = pd.DataFrame.from_dict(SAMPLE_INFO, orient='index').T + output_directory = os.path.dirname(args.output) + sample_df.to_csv(f"{output_directory}/samples.tsv", sep="\t", index_label=False) + + # write LICENSE + with open(f"{os.path.dirname(args.output)}/LICENSE.md", 'w') as file: + file.write(LICENSE) + +if __name__ == "__main__": + main() From 00ac60ade060bc87d20279fe2b6e68a27993974c Mon Sep 17 00:00:00 2001 From: pakiessling Date: Wed, 13 Dec 2023 10:01:06 +0000 Subject: [PATCH 3/4] cleanup --- data/agkuppe_heart/setup_merfish.py | 81 ---------- data/agkuppe_kidney/setup_merfish.py | 81 ---------- .../libd_dlpfc-checkpoint.r | 140 ++++++++++++++++++ 3 files changed, 140 insertions(+), 162 deletions(-) delete mode 100644 data/agkuppe_heart/setup_merfish.py delete mode 100644 data/agkuppe_kidney/setup_merfish.py create mode 100755 data/libd_dlpfc/.ipynb_checkpoints/libd_dlpfc-checkpoint.r diff --git a/data/agkuppe_heart/setup_merfish.py b/data/agkuppe_heart/setup_merfish.py deleted file mode 100644 index 7717249c..00000000 --- a/data/agkuppe_heart/setup_merfish.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -import argparse -import shutil -import scipy -import squidpy as sq # Needs to be installed from github not pypi -import numpy as np -import pandas as pd - -def copy_images(input_folder, output_folder): - # Ensure the output folder exists, create if not - os.makedirs(output_folder, exist_ok=True) - - # Get a list of files in the input folder - files = os.listdir(input_folder) - files = [file for file in files if file.endswith(".tif")] - # Copy image - for file in files: - input_path = os.path.join(input_folder, file) - output_path = os.path.join(output_folder, file) - shutil.copy2(input_path, output_path) - print(f"Copied: {input_path} to {output_path}") - - -def convert_data(input_folder, output_folder,ct_file): - os.makedirs(output_folder, exist_ok=True) - adata = sq.read.vizgen(input_folder,counts_file="cell_by_gene.csv",meta_file="cell_metadata.csv") - - # Observations - obs = adata.obs.copy() - obs["selected"] = "true" - if ct_file != None: - print("adding ct") - ct = pd.read_table(ct_file, index_col=0) - ct.index = ct.index.astype("str") - obs["cell_type"] = ct["cell_type"] - obs['cell_type'].fillna('filtered', inplace=True) - - adata.obs["cell_type"] = ct["cell_type"] - adata.obs["cell_type"].fillna('filtered', inplace=True) - - obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") - - # Features - vars = adata.var.copy() - vars["selected"] = "true" - bad_genes = ["eGFP","mCherry2","tdToma"] - vars.loc[vars.index.isin(bad_genes), "selected"] = "false" - vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") - - # Coordinates - coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) - coords.index = adata.obs.index - coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") - - # Matrix - scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) - - # Anndata - adata.write_h5ad(f"{output_folder}/anndata.h5ad") - - -def main(): - # Set up command-line argument parser - parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") - - # Add arguments for input and output folders - parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) - parser.add_argument("--output", help="Path to the output folder",required=True) - parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) - - - # Parse the command-line arguments - args = parser.parse_args() - - # Call the function to copy files - convert_data(args.input, args.output,args.ct) - #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) - - -if __name__ == "__main__": - main() diff --git a/data/agkuppe_kidney/setup_merfish.py b/data/agkuppe_kidney/setup_merfish.py deleted file mode 100644 index 7717249c..00000000 --- a/data/agkuppe_kidney/setup_merfish.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -import argparse -import shutil -import scipy -import squidpy as sq # Needs to be installed from github not pypi -import numpy as np -import pandas as pd - -def copy_images(input_folder, output_folder): - # Ensure the output folder exists, create if not - os.makedirs(output_folder, exist_ok=True) - - # Get a list of files in the input folder - files = os.listdir(input_folder) - files = [file for file in files if file.endswith(".tif")] - # Copy image - for file in files: - input_path = os.path.join(input_folder, file) - output_path = os.path.join(output_folder, file) - shutil.copy2(input_path, output_path) - print(f"Copied: {input_path} to {output_path}") - - -def convert_data(input_folder, output_folder,ct_file): - os.makedirs(output_folder, exist_ok=True) - adata = sq.read.vizgen(input_folder,counts_file="cell_by_gene.csv",meta_file="cell_metadata.csv") - - # Observations - obs = adata.obs.copy() - obs["selected"] = "true" - if ct_file != None: - print("adding ct") - ct = pd.read_table(ct_file, index_col=0) - ct.index = ct.index.astype("str") - obs["cell_type"] = ct["cell_type"] - obs['cell_type'].fillna('filtered', inplace=True) - - adata.obs["cell_type"] = ct["cell_type"] - adata.obs["cell_type"].fillna('filtered', inplace=True) - - obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") - - # Features - vars = adata.var.copy() - vars["selected"] = "true" - bad_genes = ["eGFP","mCherry2","tdToma"] - vars.loc[vars.index.isin(bad_genes), "selected"] = "false" - vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") - - # Coordinates - coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) - coords.index = adata.obs.index - coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") - - # Matrix - scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) - - # Anndata - adata.write_h5ad(f"{output_folder}/anndata.h5ad") - - -def main(): - # Set up command-line argument parser - parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") - - # Add arguments for input and output folders - parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) - parser.add_argument("--output", help="Path to the output folder",required=True) - parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) - - - # Parse the command-line arguments - args = parser.parse_args() - - # Call the function to copy files - convert_data(args.input, args.output,args.ct) - #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) - - -if __name__ == "__main__": - main() diff --git a/data/libd_dlpfc/.ipynb_checkpoints/libd_dlpfc-checkpoint.r b/data/libd_dlpfc/.ipynb_checkpoints/libd_dlpfc-checkpoint.r new file mode 100755 index 00000000..8dbd7c44 --- /dev/null +++ b/data/libd_dlpfc/.ipynb_checkpoints/libd_dlpfc-checkpoint.r @@ -0,0 +1,140 @@ +#!/usr/bin/env Rscript + +# Author_and_contribution: Niklas Mueller-Boetticher; created script + +suppressPackageStartupMessages(library(optparse)) + +option_list <- list( + make_option( + c("-o", "--out_dir"), + type = "character", default = NULL, + help = "Output directory to write files to." + ) +) + +description <- "Load data for LIBD DLPFC (http://research.libd.org/spatialLIBD/)." + +opt_parser <- OptionParser( + usage = description, + option_list = option_list +) +opt <- parse_args(opt_parser) + +out_dir <- opt$out_dir + +## Your code goes here +suppressPackageStartupMessages(library(spatialLIBD)) +suppressPackageStartupMessages(library(magrittr)) + +write_SpatialExperiment_to_folder <- function( + spe, path, obs_col, label_col = "label", assay_name = "counts") { + dir.create(path, showWarnings = FALSE, recursive = TRUE) + + colData(spe)[label_col] %>% + as.data.frame() %>% + dplyr::filter(!is.na(!!as.symbol(label_col))) %>% + write.table(file.path(path, "labels.tsv"), sep = "\t", col.names = NA, quote = FALSE) + + colData(spe)[obs_col] %>% + as.data.frame() %>% + write.table(file.path(path, "observations.tsv"), sep = "\t", col.names = NA, quote = FALSE) + + rowData(spe) %>% + as.data.frame() %>% + write.table(file.path(path, "features.tsv"), sep = "\t", col.names = NA, quote = FALSE) + + coords <- spatialCoords(spe) + mode(coords) <- "integer" + as.data.frame(coords) %>% + dplyr::rename(x = "pxl_col_in_fullres", y = "pxl_row_in_fullres") %>% + write.table(file.path(path, "coordinates.tsv"), sep = "\t", col.names = NA, quote = FALSE) + + assay(spe, assay_name) %>% + t() %>% + Matrix::writeMM(file.path(path, "counts.mtx")) +} + +spe <- fetch_data("spe") + +keep_cols <- c("sample_id", "subject", "position", "replicate", "discard", "spatialLIBD", "array_row", "array_col") + +colData(spe) <- colData(spe)[, keep_cols] +colnames(colData(spe))[colnames(colData(spe)) == "array_row"] <- "row" +colnames(colData(spe))[colnames(colData(spe)) == "array_col"] <- "col" +colnames(colData(spe))[colnames(colData(spe)) == "spatialLIBD"] <- "label" + +keep_rows <- c("gene_version", "gene_name", "source", "gene_biotype") +rowData(spe) <- rowData(spe)[, keep_rows] + +patients <- unique(colData(spe)$subject) +for (patient in patients) { + patient_spe <- spe[, spe$subject == patient] + samples <- unique(colData(patient_spe)$sample_id) + for (sample in samples) { + spe_sample <- patient_spe[, patient_spe$sample_id == sample] + colData(spe_sample) <- colData(spe_sample)[, c("label", "row", "col")] # "discard" + write_SpatialExperiment_to_folder( + spe_sample, + file.path(out_dir, paste(patient, sample, sep = "_")), + obs_col = c("row", "col") + ) + } +} + +sample2patient <- colData(spe)[, c("sample_id", "subject")] %>% + as.data.frame() %>% + dplyr::distinct() %>% + tibble::deframe() + +img_links <- c( + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151507_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151508_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151509_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151510_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151669_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151670_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151671_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151672_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151673_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151674_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151675_full_image.tif", + "https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/151676_full_image.tif" +) + +img_links <- tibble::as_tibble(list("link" = img_links)) %>% + dplyr::mutate( + sample = stringr::str_extract(link, "([^/]+)_full_image.tif$", group = 1), + patient = sample2patient[sample], + filename = "H_E.tiff" + ) + +options(timeout = 60 * 60) +purrr::pwalk(img_links, function(link, sample, patient, filename) { + download.file( + link, + file.path(out_dir, paste(patient, sample, sep = "_"), filename), + "wget", + quiet = TRUE + ) +}) + +purrr::pwalk(img_links, function(link, sample, patient, filename) { + json <- file(file.path(out_dir, paste(patient, sample, sep = "_"), "H_E.json")) + writeLines(c('{"scale": 1}'), json) + close(json) +}) + +colData(spe) %>% + as.data.frame() %>% + dplyr::select(patient = subject, sample = sample_id, position, replicate, label) %>% + dplyr::filter(!is.na(label)) %>% + dplyr::distinct() %>% + dplyr::count(patient, sample, position, replicate) %>% + dplyr::rename(n_clusters = n) %>% + dplyr::mutate(directory = paste(patient, sample, sep = "_")) %>% + `row.names<-`(NULL) %>% + write.table(file.path(out_dir, "samples.tsv"), sep = "\t", col.names = NA, quote = FALSE) + +json <- file(file.path(out_dir, "experiment.json")) +writeLines(c('{"technology": "Visium"}'), json) +close(json) From fdea12d69b2d5673c88bf551f51ab0cb4ccd3248 Mon Sep 17 00:00:00 2001 From: pakiessling Date: Wed, 13 Dec 2023 10:02:45 +0000 Subject: [PATCH 4/4] cleanup --- .../.ipynb_checkpoints/agkuppe-checkpoint.yml | 8 -- .../setup_merfish-checkpoint.py | 123 ------------------ 2 files changed, 131 deletions(-) delete mode 100644 data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml delete mode 100644 data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py diff --git a/data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml b/data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml deleted file mode 100644 index c1711057..00000000 --- a/data/agkuppe_heart_kidney/.ipynb_checkpoints/agkuppe-checkpoint.yml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - conda-forge -dependencies: - - python=3.11.6 - - scipy=1.11.4 - - anndata=0.10.3 - - numpy=1.23.4 - - pandas=2.1.3 \ No newline at end of file diff --git a/data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py b/data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py deleted file mode 100644 index 30d78f13..00000000 --- a/data/agkuppe_heart_kidney/.ipynb_checkpoints/setup_merfish-checkpoint.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import argparse -import shutil -import scipy -import anndata -import numpy as np -import pandas as pd -import json - - -BAD_GENES = ["eGFP","mCherry2","tdToma"] -META_DICT = {"technology":"Merfish"} -SAMPLE_INFO = {"patient":"1","sample":"1","position":"0","replicate":"1","n_clusters":"0","directory":f"os.path.basename(args.output)"} -LICENSE = """ -This dataset was created by AG Kuppe at the University Hospital Aachen, Germany. - -It may only be used in the context of the Spacehack 2023 event. - -In case of any questions feel free to contact Paul Kiessling, pakiessling@ukaachen.de. -""" - - -def copy_images(input_folder, output_folder): - # Ensure the output folder exists, create if not - os.makedirs(output_folder, exist_ok=True) - - # Get a list of files in the input folder - files = os.listdir(input_folder) - files = [file for file in files if file.endswith(".tif")] - # Copy image - for file in files: - input_path = os.path.join(input_folder, file) - output_path = os.path.join(output_folder, file) - shutil.copy2(input_path, output_path) - print(f"Copied: {input_path} to {output_path}") - - -def load_into_anndata(input_folder): - data = pd.read_csv(input_folder + "/cell_by_gene.csv", index_col=0, dtype={"cell": str}) - obs = pd.read_csv(input_folder + "/cell_metadata.csv", index_col=0, dtype={"EntityID": str}) - is_gene = ~data.columns.str.lower().str.contains("blank") - adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) - adata.obsm["blank"] = data.loc[:, ~is_gene] - adata = adata[:,~adata.var_names.isin(BAD_GENES)] - adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].values - adata.obs["EntityID"] = adata.obs.index - return adata - -def convert_data(input_folder, output_folder,ct_file): - os.makedirs(output_folder, exist_ok=True) - adata = load_into_anndata(input_folder) - - # Observations - obs = adata.obs.copy() - obs["selected"] = "true" - if ct_file != None: - print("adding ct") - ct = pd.read_table(ct_file, index_col=0) - ct.index = ct.index.astype("str") - obs["cell_type"] = ct["cell_type"] - obs['cell_type'].fillna('filtered', inplace=True) - - adata.obs["cell_type"] = ct["cell_type"] - adata.obs["cell_type"].fillna('filtered', inplace=True) - - obs.to_csv(f"{output_folder}/observations.tsv",sep="\t",index_label="") - - # Features - vars = adata.var.copy() - vars["selected"] = "true" - - vars.to_csv(f"{output_folder}/features.tsv",sep="\t",index_label="") - - # Coordinates - coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) - coords.index = adata.obs.index - coords.to_csv(f"{output_folder}/coordinates.tsv",sep="\t",index_label="") - - # Matrix - scipy.io.mmwrite(f"{output_folder}/counts.mtx",adata.X) - - # Anndata - adata.write_h5ad(f"{output_folder}/anndata.h5ad") - - -def write_json(dict,output_path): - with open(output_path, 'w') as json_file: - json.dump(dict, json_file) - - - - -def main(): - # Set up command-line argument parser - parser = argparse.ArgumentParser(description="Convert Vizgen Merfish Data to Spacehack format.") - - # Add arguments for input and output folders - parser.add_argument("--input", help="Path to the input folder containing Vizgen Merscope output",required=True) - parser.add_argument("--output", help="Path to the output folder",required=True) - parser.add_argument("--ct", help="Path to tsv containing cell-barcode and ct,columname should be 'cell'",required=False) - - - # Parse the command-line arguments - args = parser.parse_args() - - # Call the function to copy files - convert_data(args.input, args.output,args.ct) - #copy_images(os.path.join(args.input_folder, "images"), args.output_folder) - - # write json - write_json(META_DICT,os.path.join(os.path.dirname(args.output), "experiment.json")) - - # write samples.tsv - sample_df = pd.DataFrame.from_dict(SAMPLE_INFO, orient='index').T - output_directory = os.path.dirname(args.output) - sample_df.to_csv(f"{output_directory}/samples.tsv", sep="\t", index_label=False) - - # write LICENSE - with open(f"{os.path.dirname(args.output)}/LICENSE.md", 'w') as file: - file.write(LICENSE) - -if __name__ == "__main__": - main()