From e762b0156d8c906f26f73cc809afc6b5d2657b8d Mon Sep 17 00:00:00 2001 From: Eric Date: Fri, 7 Jun 2024 12:14:10 -0500 Subject: [PATCH 01/14] First passing of adding input_assure. --- bin/input_assure.py | 62 ++++++++++++++++++++++++++++++ conf/modules.config | 4 ++ modules/local/input_assure/main.nf | 31 +++++++++++++++ workflows/gasclustering.nf | 16 ++++---- 4 files changed, 106 insertions(+), 7 deletions(-) create mode 100755 bin/input_assure.py create mode 100644 modules/local/input_assure/main.nf diff --git a/bin/input_assure.py b/bin/input_assure.py new file mode 100755 index 0000000..5c8365d --- /dev/null +++ b/bin/input_assure.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +import json +import argparse +import sys +import csv +import gzip + +def open_file(file_path, mode): + # Open a file based on the file extension + if file_path.endswith('.gz'): + return gzip.open(file_path, mode) + else: + return open(file_path, mode) + +def check_inputs(json_file, sample_id, address, output_error_file): + # Define a variable to store the match_status (True or False) + with open(json_file, "rt") as f: + json_data = json.load(f) + match_status = sample_id in json_data + + # Define the original key in the JSON data + original_key = list(json_data.keys())[0] + + # Define error message based on meta.address (query or reference) + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
+ + # Write sample ID and JSON key to error report CSV if not matched; include error message + if not match_status: + with open(output_error_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["sample", "JSON_key", "error_message"]) + writer.writerow([sample_id, original_key, error_message]) + + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + with open(json_file, "wt") as f: + json.dump(json_data, f, indent=4) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." + ) + parser.add_argument("--input", help="Path to the mlst.json file.", required=True) + parser.add_argument( + "--sample_id", help="Sample ID to check in the JSON file.", required=True + ) + parser.add_argument( + "--address", help="Address to use in the error message.", required=True + ) + parser.add_argument( + "--output_error", help="Path to the error report file.", required=True + ) + + args = parser.parse_args() + + check_inputs( + args.input, args.sample_id, args.address, args.output_error + ) diff --git a/conf/modules.config b/conf/modules.config index 9441dab..9e7bba6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,6 +28,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: INPUT_ASSURE { + fair = true + } + withName: GAS_MCLUSTER { publishDir = [ path: { ["${params.outdir}", "${task.cluster_directory_name}"].join(File.separator) }, diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf new file mode 100644 index 0000000..e0376ac --- /dev/null +++ b/modules/local/input_assure/main.nf @@ -0,0 +1,31 @@ +process INPUT_ASSURE { + tag "Assures Inputs are Consistent" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(mlst) + + output: + tuple val(meta), path(mlst), emit: result + tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report + path("versions.yml"), emit: versions + + script: + + """ + input_assure.py \\ + --input ${mlst} \\ + --sample_id ${meta.id} \\ + --address ${meta.address} \\ + --output_error ${meta.id}_error_report.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/workflows/gasclustering.nf b/workflows/gasclustering.nf index bcf242d..5e0303c 100644 --- a/workflows/gasclustering.nf +++ b/workflows/gasclustering.nf @@ -31,6 +31,7 @@ include { PROFILE_DISTS } from '../modules/local/profile_dists/main' include { GAS_MCLUSTER } from '../modules/local/gas/mcluster/main' include { APPEND_METADATA } from '../modules/local/appendmetadata/main' include { ARBOR_VIEW } from '../modules/local/arborview.nf' +include { INPUT_ASSURE } from "../modules/local/input_assure/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,7 +74,12 @@ workflow GASCLUSTERING { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") - merged_alleles = input.map{ + // Make sure the ID in samplesheet / meta.id is the same ID + // as the corresponding MLST JSON file: + input_assure = INPUT_ASSURE(input) + ch_versions = ch_versions.mix(input_assure.versions) + + merged_alleles = input_assure.result.map{ meta, mlst_files -> mlst_files }.collect() @@ -85,7 +91,7 @@ workflow GASCLUSTERING { params.metadata_7_header, params.metadata_8_header) ) - metadata_rows = input.map{ + metadata_rows = input_assure.result.map{ meta, mlst_files -> tuple(meta.id, meta.metadata_1, meta.metadata_2, meta.metadata_3, meta.metadata_4, meta.metadata_5, 
meta.metadata_6, meta.metadata_7, meta.metadata_8) @@ -109,7 +115,7 @@ workflow GASCLUSTERING { exit 1, "--gm_thresholds ${params.gm_thresholds}: Cannot pass null or empty string" } - gm_thresholds_list = params.gm_thresholds.split(',') + gm_thresholds_list = params.gm_thresholds.toString().split(',') if (params.pd_distm == 'hamming') { if (gm_thresholds_list.any { it != null && it.contains('.') }) { exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains fractions." @@ -133,10 +139,6 @@ workflow GASCLUSTERING { clustered_data = GAS_MCLUSTER(distances.results) ch_versions = ch_versions.mix(clustered_data.versions) - /* TODO contextual data is not meant to be the clusters.tsv file output by GAS_MCLUSTER but - it is simply a place holder showing how the module is intended to be used for later re-factoring - */ - data_and_metadata = APPEND_METADATA(clustered_data.clusters, metadata_rows, metadata_headers) tree_data = clustered_data.tree.merge(data_and_metadata) // mergeing as no key to join on From d051cb5a70538eeedb2e7a6a9f4317ed54d19fdd Mon Sep 17 00:00:00 2001 From: Eric Date: Fri, 7 Jun 2024 14:26:47 -0500 Subject: [PATCH 02/14] Adding test for mismatched sample IDs. 
--- ...d_clusters_and_metadata-mismatched-ids.tsv | 4 + .../expected_dists-mismatched-ids.tsv | 4 + .../data/clusters/expected_mismatched-ids.tsv | 4 + .../clusters/expected_tree-mismatched-ids.nwk | 1 + .../expected_dists-mismatched-ids.tsv | 4 + .../expected-profile-mismatched-ids.tsv | 4 + tests/data/reports/sample_mismatch.mlst.json | 7 ++ .../samplesheet-loci-mismatch.csv | 4 + .../samplesheet-mismatched-ids.csv | 4 + tests/pipelines/main.nf.test | 75 +++++++++++++++++++ 10 files changed, 111 insertions(+) create mode 100644 tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_dists-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_tree-mismatched-ids.nwk create mode 100644 tests/data/distances/expected_dists-mismatched-ids.tsv create mode 100644 tests/data/profiles/expected-profile-mismatched-ids.tsv create mode 100644 tests/data/reports/sample_mismatch.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-loci-mismatch.csv create mode 100644 tests/data/samplesheets/samplesheet-mismatched-ids.csv diff --git a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv new file mode 100644 index 0000000..7f7eb5b --- /dev/null +++ b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 +sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 +sampleB 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 +sampleC 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/clusters/expected_dists-mismatched-ids.tsv b/tests/data/clusters/expected_dists-mismatched-ids.tsv new file mode 100644 index 0000000..251971f --- /dev/null +++ b/tests/data/clusters/expected_dists-mismatched-ids.tsv 
@@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sampleA 1.1.1 1 1 1 +sampleB 1.1.1 1 1 1 +sampleC 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_mismatched-ids.tsv b/tests/data/clusters/expected_mismatched-ids.tsv new file mode 100644 index 0000000..251971f --- /dev/null +++ b/tests/data/clusters/expected_mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sampleA 1.1.1 1 1 1 +sampleB 1.1.1 1 1 1 +sampleC 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_tree-mismatched-ids.nwk b/tests/data/clusters/expected_tree-mismatched-ids.nwk new file mode 100644 index 0000000..d2f4909 --- /dev/null +++ b/tests/data/clusters/expected_tree-mismatched-ids.nwk @@ -0,0 +1 @@ +((sampleB:0.000000,sampleA:0.000000):16.666666666666668,sampleC:33.333333); diff --git a/tests/data/distances/expected_dists-mismatched-ids.tsv b/tests/data/distances/expected_dists-mismatched-ids.tsv new file mode 100644 index 0000000..1a64f2b --- /dev/null +++ b/tests/data/distances/expected_dists-mismatched-ids.tsv @@ -0,0 +1,4 @@ +dists sampleA sampleB sampleC +sampleA 0.0 0.0 33.333333333333336 +sampleB 0.0 0.0 33.333333333333336 +sampleC 33.333333333333336 33.333333333333336 0.0 diff --git a/tests/data/profiles/expected-profile-mismatched-ids.tsv b/tests/data/profiles/expected-profile-mismatched-ids.tsv new file mode 100644 index 0000000..3e1a1d6 --- /dev/null +++ b/tests/data/profiles/expected-profile-mismatched-ids.tsv @@ -0,0 +1,4 @@ +sample_id l1 l2 l3 +sampleA 1 1 1 +sampleB 1 1 1 +sampleC 1 1 2 diff --git a/tests/data/reports/sample_mismatch.mlst.json b/tests/data/reports/sample_mismatch.mlst.json new file mode 100644 index 0000000..b7b003b --- /dev/null +++ b/tests/data/reports/sample_mismatch.mlst.json @@ -0,0 +1,7 @@ +{ + "sample_mismatch": { + "l1": "2", + "mb": "3", + "mc": "1" + } +} diff --git a/tests/data/samplesheets/samplesheet-loci-mismatch.csv b/tests/data/samplesheets/samplesheet-loci-mismatch.csv new file mode 100644 index 0000000..45808a4 --- 
/dev/null +++ b/tests/data/samplesheets/samplesheet-loci-mismatch.csv @@ -0,0 +1,4 @@ +sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 +sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,,,,, +sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,, +sample_mismatch,/home/eric/projects/gasclustering/tests/data/reports/sample_mismatch.mlst.json,,,,,,,, diff --git a/tests/data/samplesheets/samplesheet-mismatched-ids.csv b/tests/data/samplesheets/samplesheet-mismatched-ids.csv new file mode 100644 index 0000000..632768d --- /dev/null +++ b/tests/data/samplesheets/samplesheet-mismatched-ids.csv @@ -0,0 +1,4 @@ +sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 +sampleA,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8 +sampleB,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8 +sampleC,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index f982de5..d62ef25 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -427,4 +427,79 @@ nextflow_pipeline { assert iridanext_metadata.isEmpty() } } + + test("Testing mismatched IDs") { + // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. + // This tests the pipeline's ability to handle and correct for this problem. 
+ + tag "mismatch" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-mismatched-ids.csv" + outdir = "results" + + pd_distm = "scaled" + gm_thresholds = "1,0.5,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check MLST files + def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-mismatched-ids.tsv") + assert actual_profile_tsv.text == expected_profile_tsv.text + + // Check computed distance matrix is correct and that the file exists + def actual_distances = path("$launchDir/results/distances/profile_dists.results.text") + assert actual_distances.exists() + def expected_distances = path("$baseDir/tests/data/distances/expected_dists-mismatched-ids.tsv") + assert actual_distances.text == expected_distances.text + + // Check computed clusters are correct and exist + def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk") + def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") + assert actual_tree.exists() + assert actual_clusters.exists() + def expected_tree = path("$baseDir/tests/data/clusters/expected_tree-mismatched-ids.nwk") + def expected_clusters = path("$baseDir/tests/data/clusters/expected_dists-mismatched-ids.tsv") + assert actual_tree.text == expected_tree.text + assert actual_clusters.text == expected_clusters.text + + // Check appended metadata is correct: + def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv") + assert actual_metadata.exists() + def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv") + assert actual_metadata.text == expected_metadata.text + + // Check that the ArborView output is created + def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") + assert actual_arborview.exists() 
+ assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1 + + assert iridanext_samples.isEmpty() + assert iridanext_metadata.isEmpty() + } + } } From a85d89c2cb6ca787218602364e9173238c815b4a Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 12 
Jun 2024 11:59:08 -0500 Subject: [PATCH 03/14] Updating input_assure script. --- bin/input_assure.py | 59 +++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5c8365d..779e888 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -2,43 +2,62 @@ import json import argparse -import sys import csv import gzip + def open_file(file_path, mode): # Open a file based on the file extension - if file_path.endswith('.gz'): + if file_path.endswith(".gz"): return gzip.open(file_path, mode) else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): - # Define a variable to store the match_status (True or False) - with open(json_file, "rt") as f: + with open_file(json_file, "rt") as f: json_data = json.load(f) + + # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - # Define the original key in the JSON data - original_key = list(json_data.keys())[0] + keys = list(json_data.keys()) + original_key = keys[0] - # Define error message based on meta.address (query or reference) - if address == "null": - error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." - else: - error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
+ # Initialize the error message + error_message = None - # Write sample ID and JSON key to error report CSV if not matched; include error message - if not match_status: + # Check for multiple keys in the JSON file and define error message + if len(keys) > 1: + # Check if sample_id matches any key + if not match_status: + error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." + # Retain only the specified sample ID + json_data = {sample_id: json_data.pop(original_key)} + else: + error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" + # Remove all keys except the one matching sample_id + json_data = {sample_id: json_data[sample_id]} + elif not match_status: + # Define error message based on meta.address (query or reference) + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
+ # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + + # Write file containing relevant error messages + if error_message: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, original_key, error_message]) + writer.writerow([sample_id, keys, error_message]) + + # Write the updated JSON data back to the original file + with open_file(json_file, "wt") as f: + json.dump(json_data, f, indent=4) - # Update the JSON file with the new sample ID - json_data[sample_id] = json_data.pop(original_key) - with open(json_file, "wt") as f: - json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -57,6 +76,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs( - args.input, args.sample_id, args.address, args.output_error - ) + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 2e0df69c8166ebf6d0e518630d40ebe36f7f6266 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 12 Jun 2024 12:00:41 -0500 Subject: [PATCH 04/14] Removing unused file. 
--- tests/data/samplesheets/samplesheet-loci-mismatch.csv | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 tests/data/samplesheets/samplesheet-loci-mismatch.csv diff --git a/tests/data/samplesheets/samplesheet-loci-mismatch.csv b/tests/data/samplesheets/samplesheet-loci-mismatch.csv deleted file mode 100644 index 45808a4..0000000 --- a/tests/data/samplesheets/samplesheet-loci-mismatch.csv +++ /dev/null @@ -1,4 +0,0 @@ -sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 -sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,,,,, -sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,, -sample_mismatch,/home/eric/projects/gasclustering/tests/data/reports/sample_mismatch.mlst.json,,,,,,,, From 8bb6d3dd1aec86ea51d1016e75001ca853317029 Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 17 Jun 2024 14:24:34 -0500 Subject: [PATCH 05/14] Updating input_assure --- bin/input_assure.py | 24 +++++++++++++++++------- modules/local/input_assure/main.nf | 5 +++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 779e888..d99bf2a 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -4,6 +4,7 @@ import argparse import csv import gzip +import sys def open_file(file_path, mode): @@ -14,21 +15,25 @@ def open_file(file_path, mode): return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_error_file): +def check_inputs(json_file, sample_id, address, output_error_file, output_json_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - keys = list(json_data.keys()) - original_key = keys[0] - # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error 
message - if len(keys) > 1: + keys = list(json_data.keys()) + original_key = keys[0] if keys else None + + if len(keys) == 0: + error_message = f"{json_file} is completely empty!" + print(error_message) + sys.exit(1) + elif len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." @@ -55,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data back to the original file - with open_file(json_file, "wt") as f: + with gzip.open(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -73,7 +78,12 @@ def check_inputs(json_file, sample_id, address, output_error_file): parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) + parser.add_argument( + "--output_json", help="Path to the MLST JSON file (gzipped).", required=True + ) args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs( + args.input, args.sample_id, args.address, args.output_error, args.output_json + ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index e0376ac..43b7462 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: result + tuple val(meta), path("${meta.id}.mlst.json.gz"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,7 +21,8 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error 
${meta.id}_error_report.csv \\ + --output_json ${meta.id}.mlst.json.gz cat <<-END_VERSIONS > versions.yml "${task.process}": From 61c2c927b5dc422986c760f5b1e5fa9aa46e3ff0 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 18 Jun 2024 09:55:23 -0500 Subject: [PATCH 06/14] Removing unused file. --- tests/data/reports/sample_mismatch.mlst.json | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 tests/data/reports/sample_mismatch.mlst.json diff --git a/tests/data/reports/sample_mismatch.mlst.json b/tests/data/reports/sample_mismatch.mlst.json deleted file mode 100644 index b7b003b..0000000 --- a/tests/data/reports/sample_mismatch.mlst.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "sample_mismatch": { - "l1": "2", - "mb": "3", - "mc": "1" - } -} From 34535b98b21f71632612e6667093fdff286d5730 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 18 Jun 2024 14:58:31 -0500 Subject: [PATCH 07/14] Updating tests. --- ...rs_and_metadata-partial-mismatched-ids.tsv | 4 + .../expected_dists-partial-mismatched-ids.tsv | 4 + .../expected_tree-partial-mismatched-ids.nwk | 1 + .../expected_dists-partial-mismatched-ids.tsv | 4 + ...xpected-profile-partial-mismatched-ids.tsv | 4 + .../samplesheet-partial-mismatched-ids.csv | 4 + tests/pipelines/main.nf.test | 102 ++++++++++++++++++ 7 files changed, 123 insertions(+) create mode 100644 tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_dists-partial-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_tree-partial-mismatched-ids.nwk create mode 100644 tests/data/distances/expected_dists-partial-mismatched-ids.tsv create mode 100644 tests/data/profiles/expected-profile-partial-mismatched-ids.tsv create mode 100644 tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv diff --git a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv new file mode 
100644 index 0000000..349ca7b --- /dev/null +++ b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 +sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 +sampleB 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 +sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv new file mode 100644 index 0000000..0933a29 --- /dev/null +++ b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sampleA 1.1.1 1 1 1 +sampleB 1.1.1 1 1 1 +sample3 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk b/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk new file mode 100644 index 0000000..7b4e386 --- /dev/null +++ b/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk @@ -0,0 +1 @@ +((sampleB:0.000000,sampleA:0.000000):16.666666666666668,sample3:33.333333); diff --git a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv new file mode 100644 index 0000000..e7b7940 --- /dev/null +++ b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +dists sampleA sampleB sample3 +sampleA 0.0 0.0 33.333333333333336 +sampleB 0.0 0.0 33.333333333333336 +sample3 33.333333333333336 33.333333333333336 0.0 diff --git a/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv b/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv new file mode 100644 index 0000000..5289227 --- /dev/null +++ b/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +sample_id l1 l2 l3 +sampleA 1 1 1 +sampleB 1 1 1 +sample3 1 1 2 diff --git 
a/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv b/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv new file mode 100644 index 0000000..d5d42f0 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv @@ -0,0 +1,4 @@ +sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 +sampleA,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8 +sampleB,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8 +sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d62ef25..0d9d318 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -29,6 +29,14 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() + // Check ID correction: + def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv") + def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv") + def sampleC_report = path("$launchDir/results/input/sampleC_error_report.csv") + assert sampleA_report.exists() == false + assert sampleB_report.exists() == false + assert sampleC_report.exists() == false + // Check MLST files def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") @@ -448,6 +456,17 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() + // Check ID correction: + def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv") + def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv") + def sampleC_report = 
path("$launchDir/results/input/sampleC_error_report.csv") + assert sampleA_report.exists() + assert sampleB_report.exists() + assert sampleC_report.exists() + assert sampleA_report.text.contains("sampleA,['sample1'],Query sampleA ID and JSON key in sample1.mlst.json DO NOT MATCH.") + assert sampleB_report.text.contains("sampleB,['sample2'],Query sampleB ID and JSON key in sample2.mlst.json DO NOT MATCH.") + assert sampleC_report.text.contains("sampleC,['sample3'],Query sampleC ID and JSON key in sample3.mlst.json DO NOT MATCH.") + // Check MLST files def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-mismatched-ids.tsv") @@ -502,4 +521,87 @@ nextflow_pipeline { assert iridanext_metadata.isEmpty() } } + + test("Testing partially mismatched IDs") { + + tag "partial_mismatch" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv" + outdir = "results" + + pd_distm = "scaled" + gm_thresholds = "1,0.5,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check ID correction: + def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv") + def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv") + def sampleC_report = path("$launchDir/results/input/sampleC_error_report.csv") + assert sampleA_report.exists() + assert sampleB_report.exists() + assert sampleC_report.exists() == false + assert sampleA_report.text.contains("sampleA,['sample1'],Query sampleA ID and JSON key in sample1.mlst.json DO NOT MATCH.") + assert sampleB_report.text.contains("sampleB,['sample2'],Query sampleB ID and JSON key in sample2.mlst.json DO NOT MATCH.") + + // Check MLST files + def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") + def expected_profile_tsv = 
path("$baseDir/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv") + assert actual_profile_tsv.text == expected_profile_tsv.text + + // Check computed distance matrix is correct and that the file exists + def actual_distances = path("$launchDir/results/distances/profile_dists.results.text") + assert actual_distances.exists() + def expected_distances = path("$baseDir/tests/data/distances/expected_dists-partial-mismatched-ids.tsv") + assert actual_distances.text == expected_distances.text + + // Check computed clusters are correct and exist + def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk") + def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") + assert actual_tree.exists() + assert actual_clusters.exists() + def expected_tree = path("$baseDir/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk") + def expected_clusters = path("$baseDir/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv") + assert actual_tree.text == expected_tree.text + assert actual_clusters.text == expected_clusters.text + + // Check appended metadata is correct: + def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv") + assert actual_metadata.exists() + def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv") + assert actual_metadata.text == expected_metadata.text + + // Check that the ArborView output is created + def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") + assert actual_arborview.exists() + assert 
actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1 + + assert iridanext_samples.isEmpty() + assert iridanext_metadata.isEmpty() + } + } } From e76871362aaebb4fc4bf1040da4890ba0a0dc48e Mon Sep 17 00:00:00 2001 From: Eric Date: Thu, 20 Jun 2024 
15:01:50 -0500 Subject: [PATCH 08/14] Updating tests and documentation to correct issues with scaled distances. --- README.md | 22 +++++++++++++--- ...d_clusters_and_metadata-mismatched-ids.tsv | 2 +- ...rs_and_metadata-partial-mismatched-ids.tsv | 2 +- .../append/expected_clusters_and_metadata.tsv | 2 +- ..._clusters_and_metadata_little_metadata.tsv | 2 +- ...cted_clusters_and_metadata_no_metadata.tsv | 2 +- tests/data/clusters/expected_clusters.txt | 2 +- .../expected_dists-mismatched-ids.tsv | 2 +- .../expected_dists-partial-mismatched-ids.tsv | 2 +- .../data/clusters/expected_mismatched-ids.tsv | 4 --- tests/pipelines/main.nf.test | 26 +++++++++---------- tests/pipelines/main_gm_thresholds.nf.test | 4 +-- workflows/gasclustering.nf | 4 +-- 13 files changed, 44 insertions(+), 32 deletions(-) delete mode 100644 tests/data/clusters/expected_mismatched-ids.tsv diff --git a/README.md b/README.md index e41f30a..2ac1417 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,28 @@ The main parameters are `--input` as defined above and `--output` for specifying In order to customize metadata headers, the parameters `--metadata_1_header` through `--metadata_8_header` may be specified. These parameters are used to re-name the headers in the final metadata table from the defaults (e.g., rename `metadata_1` to `country`). -## Profile dists +## Distance Method and Thresholds + +The Genomic Address Service Clustering workflow can use two distance methods: Hamming or scaled. + +### Hamming Distances + +Hamming distances are integers representing the number of differing loci between two sequences and will range between [0, n], where `n` is the total number of loci. When using Hamming distances, you must specify `--pd_distm hamming` and provide Hamming distance thresholds as integers between [0, n]: `--gm_thresholds "10,5,0"` (10, 5, and 0 loci). 
+ +### Scaled Distances + +Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. Whening using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). + +### Thresholds + +The `--gm_thresholds` parameter is used to set thresholds for each cluster level, which in turn are used to assign cluster codes at each level. When specifying `--pd_distm hamming` and `--gm_thresholds "10,5,0"`, all sequences that have no more than 10 loci differences will be assigned the same cluster code for the first level, no more than 5 for the second level, and only sequences that have no loci differences will be assigned the same cluster code for the third level. + +## profile_dists The following can be used to adjust parameters for the [profile_dists][] tool. - `--pd_outfmt`: The output format for distances. For this pipeline the only valid value is _matrix_ (required by [gas mcluster][]). -- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 1. +- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0.0 and 100.0. Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. - `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1. - `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1. - `--pd_file_type`: Output format file type. One of _text_ or _parquet_. 
@@ -48,7 +64,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool. The following can be used to adjust parameters for the [gas mcluster][] tool. -- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). +- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. - `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_. - `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. diff --git a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv index 7f7eb5b..06b8614 100644 --- a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv +++ b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 sampleB 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 -sampleC 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 +sampleC 1.2.2 1 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv index 349ca7b..aa6d7ee 100644 --- a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv +++ b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 sampleB 
1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 -sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 +sample3 1.2.2 1 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata.tsv b/tests/data/append/expected_clusters_and_metadata.tsv index 01489e2..79772b3 100644 --- a/tests/data/append/expected_clusters_and_metadata.tsv +++ b/tests/data/append/expected_clusters_and_metadata.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 myheader_1 myheader_2 myheader_3 myheader_4 myheader_5 myheader_6 myheader_7 myheader_8 sample1 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 sample2 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 -sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 +sample3 1.2.2 1 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv b/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv index e3fb358..a940008 100644 --- a/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv +++ b/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sample1 1.1.1 1 1 1 1.4 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 3.1 3.2 3.8 +sample3 1.2.2 1 2 2 3.1 3.2 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv b/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv index 2c4cc3c..8e3f78a 100644 --- a/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv +++ b/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_clusters.txt b/tests/data/clusters/expected_clusters.txt index 
c4adfe5..0f639ea 100644 --- a/tests/data/clusters/expected_clusters.txt +++ b/tests/data/clusters/expected_clusters.txt @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_dists-mismatched-ids.tsv b/tests/data/clusters/expected_dists-mismatched-ids.tsv index 251971f..ef0c06b 100644 --- a/tests/data/clusters/expected_dists-mismatched-ids.tsv +++ b/tests/data/clusters/expected_dists-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sampleA 1.1.1 1 1 1 sampleB 1.1.1 1 1 1 -sampleC 2.2.2 2 2 2 +sampleC 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv index 0933a29..358fda2 100644 --- a/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv +++ b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sampleA 1.1.1 1 1 1 sampleB 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_mismatched-ids.tsv b/tests/data/clusters/expected_mismatched-ids.tsv deleted file mode 100644 index 251971f..0000000 --- a/tests/data/clusters/expected_mismatched-ids.tsv +++ /dev/null @@ -1,4 +0,0 @@ -id address level_1 level_2 level_3 -sampleA 1.1.1 1 1 1 -sampleB 1.1.1 1 1 1 -sampleC 2.2.2 2 2 2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 0d9d318..17ac9ae 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -4,7 +4,7 @@ nextflow_pipeline { script "main.nf" test("Small-scale test of full pipeline") { - tag "pipeline" + tag "pipeline_simple" when { params { @@ -12,7 +12,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" metadata_1_header = "myheader_1" metadata_2_header = "myheader_2" @@ -67,7 +67,7 @@ 
nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmyheader_1\\tmyheader_2\\tmyheader_3\\tmyheader_4\\tmyheader_5\\tmyheader_6\\tmyheader_7\\tmyheader_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsample2\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmyheader_1\\tmyheader_2\\tmyheader_3\\tmyheader_4\\tmyheader_5\\tmyheader_6\\tmyheader_7\\tmyheader_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsample2\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -293,7 +293,7 @@ nextflow_pipeline { } test("Full pipeline with no metadata") { - tag "pipeline" + tag "pipeline_no_metadata" when { params { @@ -301,7 +301,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -339,7 +339,7 @@ nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert 
actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t2.2.2\\t2\\t2\\t2\\t\\t\\t\\t\\t\\t\\t\\t\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t1.2.2\\t1\\t2\\t2\\t\\t\\t\\t\\t\\t\\t\\t\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -365,7 +365,7 @@ nextflow_pipeline { } test("Full pipeline with little metadata") { - tag "pipeline" + tag "pipeline_little_metadata" when { params { @@ -373,7 +373,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -411,7 +411,7 @@ nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t1.4\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t\\t\\t\\t\\t\\t3.8\\n") + assert 
actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t1.4\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t\\t\\t\\t\\t\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -448,7 +448,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -497,7 +497,7 @@ nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -532,7 +532,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -580,7 +580,7 @@ nextflow_pipeline { // Check that the ArborView 
output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json diff --git a/tests/pipelines/main_gm_thresholds.nf.test b/tests/pipelines/main_gm_thresholds.nf.test index 60f8cb1..ee857cd 100644 --- a/tests/pipelines/main_gm_thresholds.nf.test +++ b/tests/pipelines/main_gm_thresholds.nf.test @@ -65,14 +65,14 @@ nextflow_pipeline { input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv" outdir = "results" - gm_thresholds = "0.5,2" + gm_thresholds = "200,50" pd_distm = "scaled" } } then { assert workflow.failed - assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 0.5,2' contains thresholds outside of range [0,1]." + assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 200,50' contains thresholds outside of range [0, 100]." 
+ " Please either set '--pd_distm hamming' or adjust the threshold values.") } } diff --git a/workflows/gasclustering.nf b/workflows/gasclustering.nf index 5e0303c..d08a075 100644 --- a/workflows/gasclustering.nf +++ b/workflows/gasclustering.nf @@ -122,8 +122,8 @@ workflow GASCLUSTERING { + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") } } else if (params.pd_distm == 'scaled') { - if (gm_thresholds_list.any { it != null && (it as Float < 0 || it as Float > 1) }) { - exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0,1]." + if (gm_thresholds_list.any { it != null && (it as Float < 0.0 || it as Float > 100.0) }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0, 100]." + " Please either set '--pd_distm hamming' or adjust the threshold values.") } } else { From 85ac8e5d769e218ccfdc8b61f41d54865b68d597 Mon Sep 17 00:00:00 2001 From: Eric Marinier Date: Mon, 24 Jun 2024 09:18:21 -0500 Subject: [PATCH 09/14] Correcting spelling mistake. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ac1417..97fcc46 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Hamming distances are integers representing the number of differing loci between ### Scaled Distances -Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. Whening using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). +Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. 
When using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). ### Thresholds From 76224b96c09d7c34c784ecd64f10c259158db30f Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 24 Jun 2024 11:46:08 -0500 Subject: [PATCH 10/14] Updating to version 0.2.0. --- CHANGELOG.md | 10 ++++++++++ nextflow.config | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c3216e..7658751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.0] - 2024-06-24 + +### Added + +- Support for mismatched IDs between the samplesheet ID and the ID listed in the corresponding allele file. + +### Fixed + +- The scaled distance thresholds provided when using `--pd_distm scaled` and `--gm_thresholds` are now correctly understood as percentages in the range [0.0, 100.0]. + ## [0.1.0] - 2024-05-28 Initial release of the Genomic Address Service Clustering pipeline to be used for distance-based clustering of cg/wgMLST data. diff --git a/nextflow.config b/nextflow.config index 9156445..1dad6fb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -226,7 +226,7 @@ manifest { description = """IRIDA Next Genomic Address Service Clustering Pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.1.0' + version = '0.2.0' doi = '' defaultBranch = 'main' } From b7cfcdb1d0e323bca4ebc6b258b3e156a243da75 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 25 Jun 2024 12:43:14 -0500 Subject: [PATCH 11/14] Updating date, adding tag. 
--- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7658751..5405516 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.2.0] - 2024-06-24 +## [0.2.0] - 2024-06-26 ### Added @@ -23,3 +23,4 @@ Initial release of the Genomic Address Service Clustering pipeline to be used fo - Output of a dendrogram, cluster codes, and visualization using [profile_dists](https://github.com/phac-nml/profile_dists), [gas mcluster](https://github.com/phac-nml/genomic_address_service), and ArborView. [0.1.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.1.0 +[0.2.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.2.0 From 7dc8a234e69f53f723aa55a620631a4b502c77a4 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 25 Jun 2024 13:56:53 -0500 Subject: [PATCH 12/14] updated arborview output --- assets/ArborView.html | 159 +++++++++++++++++++++++++++--------------- 1 file changed, 103 insertions(+), 56 deletions(-) diff --git a/assets/ArborView.html b/assets/ArborView.html index 3358738..3b1c1cd 100644 --- a/assets/ArborView.html +++ b/assets/ArborView.html @@ -101,6 +101,9 @@ + + +