From e762b0156d8c906f26f73cc809afc6b5d2657b8d Mon Sep 17 00:00:00 2001 From: Eric Date: Fri, 7 Jun 2024 12:14:10 -0500 Subject: [PATCH 01/14] First passing of adding input_assure. --- bin/input_assure.py | 62 ++++++++++++++++++++++++++++++ conf/modules.config | 4 ++ modules/local/input_assure/main.nf | 31 +++++++++++++++ workflows/gasclustering.nf | 16 ++++---- 4 files changed, 106 insertions(+), 7 deletions(-) create mode 100755 bin/input_assure.py create mode 100644 modules/local/input_assure/main.nf diff --git a/bin/input_assure.py b/bin/input_assure.py new file mode 100755 index 0000000..5c8365d --- /dev/null +++ b/bin/input_assure.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +import json +import argparse +import sys +import csv +import gzip + +def open_file(file_path, mode): + # Open a file based on the file extension + if file_path.endswith('.gz'): + return gzip.open(file_path, mode) + else: + return open(file_path, mode) + +def check_inputs(json_file, sample_id, address, output_error_file): + # Define a variable to store the match_status (True or False) + with open(json_file, "rt") as f: + json_data = json.load(f) + match_status = sample_id in json_data + + # Define the original key in the JSON data + original_key = list(json_data.keys())[0] + + # Define error message based on meta.address (query or reference) + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
+ + # Write sample ID and JSON key to error report CSV if not matched; include error message + if not match_status: + with open(output_error_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["sample", "JSON_key", "error_message"]) + writer.writerow([sample_id, original_key, error_message]) + + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + with open(json_file, "wt") as f: + json.dump(json_data, f, indent=4) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." + ) + parser.add_argument("--input", help="Path to the mlst.json file.", required=True) + parser.add_argument( + "--sample_id", help="Sample ID to check in the JSON file.", required=True + ) + parser.add_argument( + "--address", help="Address to use in the error message.", required=True + ) + parser.add_argument( + "--output_error", help="Path to the error report file.", required=True + ) + + args = parser.parse_args() + + check_inputs( + args.input, args.sample_id, args.address, args.output_error + ) diff --git a/conf/modules.config b/conf/modules.config index 9441dab..9e7bba6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,6 +28,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: INPUT_ASSURE { + fair = true + } + withName: GAS_MCLUSTER { publishDir = [ path: { ["${params.outdir}", "${task.cluster_directory_name}"].join(File.separator) }, diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf new file mode 100644 index 0000000..e0376ac --- /dev/null +++ b/modules/local/input_assure/main.nf @@ -0,0 +1,31 @@ +process INPUT_ASSURE { + tag "Assures Inputs are Consistent" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(mlst) + + output: + tuple val(meta), path(mlst), emit: result + tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report + path("versions.yml"), emit: versions + + script: + + """ + input_assure.py \\ + --input ${mlst} \\ + --sample_id ${meta.id} \\ + --address ${meta.address} \\ + --output_error ${meta.id}_error_report.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/workflows/gasclustering.nf b/workflows/gasclustering.nf index bcf242d..5e0303c 100644 --- a/workflows/gasclustering.nf +++ b/workflows/gasclustering.nf @@ -31,6 +31,7 @@ include { PROFILE_DISTS } from '../modules/local/profile_dists/main' include { GAS_MCLUSTER } from '../modules/local/gas/mcluster/main' include { APPEND_METADATA } from '../modules/local/appendmetadata/main' include { ARBOR_VIEW } from '../modules/local/arborview.nf' +include { INPUT_ASSURE } from "../modules/local/input_assure/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,7 +74,12 @@ workflow GASCLUSTERING { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") - merged_alleles = input.map{ + // Make sure the ID in samplesheet / meta.id is the same ID + // as the corresponding MLST JSON file: + input_assure = INPUT_ASSURE(input) + ch_versions = ch_versions.mix(input_assure.versions) + + merged_alleles = input_assure.result.map{ meta, mlst_files -> mlst_files }.collect() @@ -85,7 +91,7 @@ workflow GASCLUSTERING { params.metadata_7_header, params.metadata_8_header) ) - metadata_rows = input.map{ + metadata_rows = input_assure.result.map{ meta, mlst_files -> tuple(meta.id, meta.metadata_1, meta.metadata_2, meta.metadata_3, meta.metadata_4, meta.metadata_5, 
meta.metadata_6, meta.metadata_7, meta.metadata_8) @@ -109,7 +115,7 @@ workflow GASCLUSTERING { exit 1, "--gm_thresholds ${params.gm_thresholds}: Cannot pass null or empty string" } - gm_thresholds_list = params.gm_thresholds.split(',') + gm_thresholds_list = params.gm_thresholds.toString().split(',') if (params.pd_distm == 'hamming') { if (gm_thresholds_list.any { it != null && it.contains('.') }) { exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains fractions." @@ -133,10 +139,6 @@ workflow GASCLUSTERING { clustered_data = GAS_MCLUSTER(distances.results) ch_versions = ch_versions.mix(clustered_data.versions) - /* TODO contextual data is not meant to be the clusters.tsv file output by GAS_MCLUSTER but - it is simply a place holder showing how the module is intended to be used for later re-factoring - */ - data_and_metadata = APPEND_METADATA(clustered_data.clusters, metadata_rows, metadata_headers) tree_data = clustered_data.tree.merge(data_and_metadata) // mergeing as no key to join on From d051cb5a70538eeedb2e7a6a9f4317ed54d19fdd Mon Sep 17 00:00:00 2001 From: Eric Date: Fri, 7 Jun 2024 14:26:47 -0500 Subject: [PATCH 02/14] Adding test for mismatched sample IDs. 
--- ...d_clusters_and_metadata-mismatched-ids.tsv | 4 + .../expected_dists-mismatched-ids.tsv | 4 + .../data/clusters/expected_mismatched-ids.tsv | 4 + .../clusters/expected_tree-mismatched-ids.nwk | 1 + .../expected_dists-mismatched-ids.tsv | 4 + .../expected-profile-mismatched-ids.tsv | 4 + tests/data/reports/sample_mismatch.mlst.json | 7 ++ .../samplesheet-loci-mismatch.csv | 4 + .../samplesheet-mismatched-ids.csv | 4 + tests/pipelines/main.nf.test | 75 +++++++++++++++++++ 10 files changed, 111 insertions(+) create mode 100644 tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_dists-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_tree-mismatched-ids.nwk create mode 100644 tests/data/distances/expected_dists-mismatched-ids.tsv create mode 100644 tests/data/profiles/expected-profile-mismatched-ids.tsv create mode 100644 tests/data/reports/sample_mismatch.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-loci-mismatch.csv create mode 100644 tests/data/samplesheets/samplesheet-mismatched-ids.csv diff --git a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv new file mode 100644 index 0000000..7f7eb5b --- /dev/null +++ b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 +sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 +sampleB 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 +sampleC 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/clusters/expected_dists-mismatched-ids.tsv b/tests/data/clusters/expected_dists-mismatched-ids.tsv new file mode 100644 index 0000000..251971f --- /dev/null +++ b/tests/data/clusters/expected_dists-mismatched-ids.tsv 
@@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sampleA 1.1.1 1 1 1 +sampleB 1.1.1 1 1 1 +sampleC 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_mismatched-ids.tsv b/tests/data/clusters/expected_mismatched-ids.tsv new file mode 100644 index 0000000..251971f --- /dev/null +++ b/tests/data/clusters/expected_mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sampleA 1.1.1 1 1 1 +sampleB 1.1.1 1 1 1 +sampleC 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_tree-mismatched-ids.nwk b/tests/data/clusters/expected_tree-mismatched-ids.nwk new file mode 100644 index 0000000..d2f4909 --- /dev/null +++ b/tests/data/clusters/expected_tree-mismatched-ids.nwk @@ -0,0 +1 @@ +((sampleB:0.000000,sampleA:0.000000):16.666666666666668,sampleC:33.333333); diff --git a/tests/data/distances/expected_dists-mismatched-ids.tsv b/tests/data/distances/expected_dists-mismatched-ids.tsv new file mode 100644 index 0000000..1a64f2b --- /dev/null +++ b/tests/data/distances/expected_dists-mismatched-ids.tsv @@ -0,0 +1,4 @@ +dists sampleA sampleB sampleC +sampleA 0.0 0.0 33.333333333333336 +sampleB 0.0 0.0 33.333333333333336 +sampleC 33.333333333333336 33.333333333333336 0.0 diff --git a/tests/data/profiles/expected-profile-mismatched-ids.tsv b/tests/data/profiles/expected-profile-mismatched-ids.tsv new file mode 100644 index 0000000..3e1a1d6 --- /dev/null +++ b/tests/data/profiles/expected-profile-mismatched-ids.tsv @@ -0,0 +1,4 @@ +sample_id l1 l2 l3 +sampleA 1 1 1 +sampleB 1 1 1 +sampleC 1 1 2 diff --git a/tests/data/reports/sample_mismatch.mlst.json b/tests/data/reports/sample_mismatch.mlst.json new file mode 100644 index 0000000..b7b003b --- /dev/null +++ b/tests/data/reports/sample_mismatch.mlst.json @@ -0,0 +1,7 @@ +{ + "sample_mismatch": { + "l1": "2", + "mb": "3", + "mc": "1" + } +} diff --git a/tests/data/samplesheets/samplesheet-loci-mismatch.csv b/tests/data/samplesheets/samplesheet-loci-mismatch.csv new file mode 100644 index 0000000..45808a4 --- 
/dev/null +++ b/tests/data/samplesheets/samplesheet-loci-mismatch.csv @@ -0,0 +1,4 @@ +sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 +sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,,,,, +sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,, +sample_mismatch,/home/eric/projects/gasclustering/tests/data/reports/sample_mismatch.mlst.json,,,,,,,, diff --git a/tests/data/samplesheets/samplesheet-mismatched-ids.csv b/tests/data/samplesheets/samplesheet-mismatched-ids.csv new file mode 100644 index 0000000..632768d --- /dev/null +++ b/tests/data/samplesheets/samplesheet-mismatched-ids.csv @@ -0,0 +1,4 @@ +sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 +sampleA,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8 +sampleB,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8 +sampleC,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index f982de5..d62ef25 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -427,4 +427,79 @@ nextflow_pipeline { assert iridanext_metadata.isEmpty() } } + + test("Testing mismatched IDs") { + // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. + // This tests the pipeline's ability to handle and correct for this problem. 
+ + tag "mismatch" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-mismatched-ids.csv" + outdir = "results" + + pd_distm = "scaled" + gm_thresholds = "1,0.5,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check MLST files + def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-mismatched-ids.tsv") + assert actual_profile_tsv.text == expected_profile_tsv.text + + // Check computed distance matrix is correct and that the file exists + def actual_distances = path("$launchDir/results/distances/profile_dists.results.text") + assert actual_distances.exists() + def expected_distances = path("$baseDir/tests/data/distances/expected_dists-mismatched-ids.tsv") + assert actual_distances.text == expected_distances.text + + // Check computed clusters are correct and exist + def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk") + def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") + assert actual_tree.exists() + assert actual_clusters.exists() + def expected_tree = path("$baseDir/tests/data/clusters/expected_tree-mismatched-ids.nwk") + def expected_clusters = path("$baseDir/tests/data/clusters/expected_dists-mismatched-ids.tsv") + assert actual_tree.text == expected_tree.text + assert actual_clusters.text == expected_clusters.text + + // Check appended metadata is correct: + def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv") + assert actual_metadata.exists() + def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv") + assert actual_metadata.text == expected_metadata.text + + // Check that the ArborView output is created + def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") + assert actual_arborview.exists() 
+ assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1 + + assert iridanext_samples.isEmpty() + assert iridanext_metadata.isEmpty() + } + } } From a85d89c2cb6ca787218602364e9173238c815b4a Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 12 
Jun 2024 11:59:08 -0500 Subject: [PATCH 03/14] Updating input_assure script. --- bin/input_assure.py | 59 +++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5c8365d..779e888 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -2,43 +2,62 @@ import json import argparse -import sys import csv import gzip + def open_file(file_path, mode): # Open a file based on the file extension - if file_path.endswith('.gz'): + if file_path.endswith(".gz"): return gzip.open(file_path, mode) else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): - # Define a variable to store the match_status (True or False) - with open(json_file, "rt") as f: + with open_file(json_file, "rt") as f: json_data = json.load(f) + + # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - # Define the original key in the JSON data - original_key = list(json_data.keys())[0] + keys = list(json_data.keys()) + original_key = keys[0] - # Define error message based on meta.address (query or reference) - if address == "null": - error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." - else: - error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
+ # Initialize the error message + error_message = None - # Write sample ID and JSON key to error report CSV if not matched; include error message - if not match_status: + # Check for multiple keys in the JSON file and define error message + if len(keys) > 1: + # Check if sample_id matches any key + if not match_status: + error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." + # Retain only the specified sample ID + json_data = {sample_id: json_data.pop(original_key)} + else: + error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" + # Remove all keys except the one matching sample_id + json_data = {sample_id: json_data[sample_id]} + elif not match_status: + # Define error message based on meta.address (query or reference) + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
+ # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + + # Write file containing relevant error messages + if error_message: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, original_key, error_message]) + writer.writerow([sample_id, keys, error_message]) + + # Write the updated JSON data back to the original file + with open_file(json_file, "wt") as f: + json.dump(json_data, f, indent=4) - # Update the JSON file with the new sample ID - json_data[sample_id] = json_data.pop(original_key) - with open(json_file, "wt") as f: - json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -57,6 +76,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs( - args.input, args.sample_id, args.address, args.output_error - ) + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 2e0df69c8166ebf6d0e518630d40ebe36f7f6266 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 12 Jun 2024 12:00:41 -0500 Subject: [PATCH 04/14] Removing unused file. 
--- tests/data/samplesheets/samplesheet-loci-mismatch.csv | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 tests/data/samplesheets/samplesheet-loci-mismatch.csv diff --git a/tests/data/samplesheets/samplesheet-loci-mismatch.csv b/tests/data/samplesheets/samplesheet-loci-mismatch.csv deleted file mode 100644 index 45808a4..0000000 --- a/tests/data/samplesheets/samplesheet-loci-mismatch.csv +++ /dev/null @@ -1,4 +0,0 @@ -sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 -sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,,,,, -sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,, -sample_mismatch,/home/eric/projects/gasclustering/tests/data/reports/sample_mismatch.mlst.json,,,,,,,, From 8bb6d3dd1aec86ea51d1016e75001ca853317029 Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 17 Jun 2024 14:24:34 -0500 Subject: [PATCH 05/14] Updating input_assure --- bin/input_assure.py | 24 +++++++++++++++++------- modules/local/input_assure/main.nf | 5 +++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 779e888..d99bf2a 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -4,6 +4,7 @@ import argparse import csv import gzip +import sys def open_file(file_path, mode): @@ -14,21 +15,25 @@ def open_file(file_path, mode): return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_error_file): +def check_inputs(json_file, sample_id, address, output_error_file, output_json_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - keys = list(json_data.keys()) - original_key = keys[0] - # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error 
message - if len(keys) > 1: + keys = list(json_data.keys()) + original_key = keys[0] if keys else None + + if len(keys) == 0: + error_message = f"{json_file} is completely empty!" + print(error_message) + sys.exit(1) + elif len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." @@ -55,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data back to the original file - with open_file(json_file, "wt") as f: + with gzip.open(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -73,7 +78,12 @@ def check_inputs(json_file, sample_id, address, output_error_file): parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) + parser.add_argument( + "--output_json", help="Path to the MLST JSON file (gzipped).", required=True + ) args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs( + args.input, args.sample_id, args.address, args.output_error, args.output_json + ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index e0376ac..43b7462 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: result + tuple val(meta), path("${meta.id}.mlst.json.gz"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,7 +21,8 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error 
${meta.id}_error_report.csv \\ + --output_json ${meta.id}.mlst.json.gz cat <<-END_VERSIONS > versions.yml "${task.process}": From 61c2c927b5dc422986c760f5b1e5fa9aa46e3ff0 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 18 Jun 2024 09:55:23 -0500 Subject: [PATCH 06/14] Removing unused file. --- tests/data/reports/sample_mismatch.mlst.json | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 tests/data/reports/sample_mismatch.mlst.json diff --git a/tests/data/reports/sample_mismatch.mlst.json b/tests/data/reports/sample_mismatch.mlst.json deleted file mode 100644 index b7b003b..0000000 --- a/tests/data/reports/sample_mismatch.mlst.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "sample_mismatch": { - "l1": "2", - "mb": "3", - "mc": "1" - } -} From 34535b98b21f71632612e6667093fdff286d5730 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 18 Jun 2024 14:58:31 -0500 Subject: [PATCH 07/14] Updating tests. --- ...rs_and_metadata-partial-mismatched-ids.tsv | 4 + .../expected_dists-partial-mismatched-ids.tsv | 4 + .../expected_tree-partial-mismatched-ids.nwk | 1 + .../expected_dists-partial-mismatched-ids.tsv | 4 + ...xpected-profile-partial-mismatched-ids.tsv | 4 + .../samplesheet-partial-mismatched-ids.csv | 4 + tests/pipelines/main.nf.test | 102 ++++++++++++++++++ 7 files changed, 123 insertions(+) create mode 100644 tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_dists-partial-mismatched-ids.tsv create mode 100644 tests/data/clusters/expected_tree-partial-mismatched-ids.nwk create mode 100644 tests/data/distances/expected_dists-partial-mismatched-ids.tsv create mode 100644 tests/data/profiles/expected-profile-partial-mismatched-ids.tsv create mode 100644 tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv diff --git a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv new file mode 
100644 index 0000000..349ca7b --- /dev/null +++ b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 +sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 +sampleB 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 +sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv new file mode 100644 index 0000000..0933a29 --- /dev/null +++ b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sampleA 1.1.1 1 1 1 +sampleB 1.1.1 1 1 1 +sample3 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk b/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk new file mode 100644 index 0000000..7b4e386 --- /dev/null +++ b/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk @@ -0,0 +1 @@ +((sampleB:0.000000,sampleA:0.000000):16.666666666666668,sample3:33.333333); diff --git a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv new file mode 100644 index 0000000..e7b7940 --- /dev/null +++ b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +dists sampleA sampleB sample3 +sampleA 0.0 0.0 33.333333333333336 +sampleB 0.0 0.0 33.333333333333336 +sample3 33.333333333333336 33.333333333333336 0.0 diff --git a/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv b/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv new file mode 100644 index 0000000..5289227 --- /dev/null +++ b/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv @@ -0,0 +1,4 @@ +sample_id l1 l2 l3 +sampleA 1 1 1 +sampleB 1 1 1 +sample3 1 1 2 diff --git 
a/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv b/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv new file mode 100644 index 0000000..d5d42f0 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv @@ -0,0 +1,4 @@ +sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8 +sampleA,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8 +sampleB,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8 +sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d62ef25..0d9d318 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -29,6 +29,14 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() + // Check ID correction: + def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv") + def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv") + def sampleC_report = path("$launchDir/results/input/sampleC_error_report.csv") + assert sampleA_report.exists() == false + assert sampleB_report.exists() == false + assert sampleC_report.exists() == false + // Check MLST files def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") @@ -448,6 +456,17 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() + // Check ID correction: + def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv") + def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv") + def sampleC_report = 
path("$launchDir/results/input/sampleC_error_report.csv") + assert sampleA_report.exists() + assert sampleB_report.exists() + assert sampleC_report.exists() + assert sampleA_report.text.contains("sampleA,['sample1'],Query sampleA ID and JSON key in sample1.mlst.json DO NOT MATCH.") + assert sampleB_report.text.contains("sampleB,['sample2'],Query sampleB ID and JSON key in sample2.mlst.json DO NOT MATCH.") + assert sampleC_report.text.contains("sampleC,['sample3'],Query sampleC ID and JSON key in sample3.mlst.json DO NOT MATCH.") + // Check MLST files def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-mismatched-ids.tsv") @@ -502,4 +521,87 @@ nextflow_pipeline { assert iridanext_metadata.isEmpty() } } + + test("Testing partially mismatched IDs") { + + tag "partial_mismatch" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv" + outdir = "results" + + pd_distm = "scaled" + gm_thresholds = "1,0.5,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check ID correction: + def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv") + def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv") + def sampleC_report = path("$launchDir/results/input/sampleC_error_report.csv") + assert sampleA_report.exists() + assert sampleB_report.exists() + assert sampleC_report.exists() == false + assert sampleA_report.text.contains("sampleA,['sample1'],Query sampleA ID and JSON key in sample1.mlst.json DO NOT MATCH.") + assert sampleB_report.text.contains("sampleB,['sample2'],Query sampleB ID and JSON key in sample2.mlst.json DO NOT MATCH.") + + // Check MLST files + def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv") + def expected_profile_tsv = 
path("$baseDir/tests/data/profiles/expected-profile-partial-mismatched-ids.tsv") + assert actual_profile_tsv.text == expected_profile_tsv.text + + // Check computed distance matrix is correct and that the file exists + def actual_distances = path("$launchDir/results/distances/profile_dists.results.text") + assert actual_distances.exists() + def expected_distances = path("$baseDir/tests/data/distances/expected_dists-partial-mismatched-ids.tsv") + assert actual_distances.text == expected_distances.text + + // Check computed clusters are correct and exist + def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk") + def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") + assert actual_tree.exists() + assert actual_clusters.exists() + def expected_tree = path("$baseDir/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk") + def expected_clusters = path("$baseDir/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv") + assert actual_tree.text == expected_tree.text + assert actual_clusters.text == expected_clusters.text + + // Check appended metadata is correct: + def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv") + assert actual_metadata.exists() + def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv") + assert actual_metadata.text == expected_metadata.text + + // Check that the ArborView output is created + def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") + assert actual_arborview.exists() + assert 
actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1 + assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1 + + assert iridanext_samples.isEmpty() + assert iridanext_metadata.isEmpty() + } + } } From e76871362aaebb4fc4bf1040da4890ba0a0dc48e Mon Sep 17 00:00:00 2001 From: Eric Date: Thu, 20 Jun 2024 
15:01:50 -0500 Subject: [PATCH 08/14] Updating tests and documentation to correct issues with scaled distances. --- README.md | 22 +++++++++++++--- ...d_clusters_and_metadata-mismatched-ids.tsv | 2 +- ...rs_and_metadata-partial-mismatched-ids.tsv | 2 +- .../append/expected_clusters_and_metadata.tsv | 2 +- ..._clusters_and_metadata_little_metadata.tsv | 2 +- ...cted_clusters_and_metadata_no_metadata.tsv | 2 +- tests/data/clusters/expected_clusters.txt | 2 +- .../expected_dists-mismatched-ids.tsv | 2 +- .../expected_dists-partial-mismatched-ids.tsv | 2 +- .../data/clusters/expected_mismatched-ids.tsv | 4 --- tests/pipelines/main.nf.test | 26 +++++++++---------- tests/pipelines/main_gm_thresholds.nf.test | 4 +-- workflows/gasclustering.nf | 4 +-- 13 files changed, 44 insertions(+), 32 deletions(-) delete mode 100644 tests/data/clusters/expected_mismatched-ids.tsv diff --git a/README.md b/README.md index e41f30a..2ac1417 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,28 @@ The main parameters are `--input` as defined above and `--output` for specifying In order to customize metadata headers, the parameters `--metadata_1_header` through `--metadata_8_header` may be specified. These parameters are used to re-name the headers in the final metadata table from the defaults (e.g., rename `metadata_1` to `country`). -## Profile dists +## Distance Method and Thresholds + +The Genomic Address Service Clustering workflow can use two distance methods: Hamming or scaled. + +### Hamming Distances + +Hamming distances are integers representing the number of differing loci between two sequences and will range between [0, n], where `n` is the total number of loci. When using Hamming distances, you must specify `--pd_distm hamming` and provide Hamming distance thresholds as integers between [0, n]: `--gm_thresholds "10,5,0"` (10, 5, and 0 loci). 
+ +### Scaled Distances + +Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. Whening using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). + +### Thresholds + +The `--gm_thresholds` parameter is used to set thresholds for each cluster level, which in turn are used to assign cluster codes at each level. When specifying `--pd_distm hamming` and `--gm_thresholds "10,5,0"`, all sequences that have no more than 10 loci differences will be assigned the same cluster code for the first level, no more than 5 for the second level, and only sequences that have no loci differences will be assigned the same cluster code for the third level. + +## profile_dists The following can be used to adjust parameters for the [profile_dists][] tool. - `--pd_outfmt`: The output format for distances. For this pipeline the only valid value is _matrix_ (required by [gas mcluster][]). -- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 1. +- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0.0 and 100.0. Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. - `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1. - `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1. - `--pd_file_type`: Output format file type. One of _text_ or _parquet_. 
@@ -48,7 +64,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool. The following can be used to adjust parameters for the [gas mcluster][] tool. -- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). +- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. - `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_. - `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. diff --git a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv index 7f7eb5b..06b8614 100644 --- a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv +++ b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 sampleB 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 -sampleC 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 +sampleC 1.2.2 1 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv index 349ca7b..aa6d7ee 100644 --- a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv +++ b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sampleA 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 sampleB 
1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 -sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 +sample3 1.2.2 1 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata.tsv b/tests/data/append/expected_clusters_and_metadata.tsv index 01489e2..79772b3 100644 --- a/tests/data/append/expected_clusters_and_metadata.tsv +++ b/tests/data/append/expected_clusters_and_metadata.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 myheader_1 myheader_2 myheader_3 myheader_4 myheader_5 myheader_6 myheader_7 myheader_8 sample1 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 sample2 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 -sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 +sample3 1.2.2 1 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv b/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv index e3fb358..a940008 100644 --- a/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv +++ b/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sample1 1.1.1 1 1 1 1.4 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 3.1 3.2 3.8 +sample3 1.2.2 1 2 2 3.1 3.2 3.8 diff --git a/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv b/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv index 2c4cc3c..8e3f78a 100644 --- a/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv +++ b/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_clusters.txt b/tests/data/clusters/expected_clusters.txt index 
c4adfe5..0f639ea 100644 --- a/tests/data/clusters/expected_clusters.txt +++ b/tests/data/clusters/expected_clusters.txt @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_dists-mismatched-ids.tsv b/tests/data/clusters/expected_dists-mismatched-ids.tsv index 251971f..ef0c06b 100644 --- a/tests/data/clusters/expected_dists-mismatched-ids.tsv +++ b/tests/data/clusters/expected_dists-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sampleA 1.1.1 1 1 1 sampleB 1.1.1 1 1 1 -sampleC 2.2.2 2 2 2 +sampleC 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv index 0933a29..358fda2 100644 --- a/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv +++ b/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sampleA 1.1.1 1 1 1 sampleB 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.2.2 1 2 2 diff --git a/tests/data/clusters/expected_mismatched-ids.tsv b/tests/data/clusters/expected_mismatched-ids.tsv deleted file mode 100644 index 251971f..0000000 --- a/tests/data/clusters/expected_mismatched-ids.tsv +++ /dev/null @@ -1,4 +0,0 @@ -id address level_1 level_2 level_3 -sampleA 1.1.1 1 1 1 -sampleB 1.1.1 1 1 1 -sampleC 2.2.2 2 2 2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 0d9d318..17ac9ae 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -4,7 +4,7 @@ nextflow_pipeline { script "main.nf" test("Small-scale test of full pipeline") { - tag "pipeline" + tag "pipeline_simple" when { params { @@ -12,7 +12,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" metadata_1_header = "myheader_1" metadata_2_header = "myheader_2" @@ -67,7 +67,7 @@ 
nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmyheader_1\\tmyheader_2\\tmyheader_3\\tmyheader_4\\tmyheader_5\\tmyheader_6\\tmyheader_7\\tmyheader_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsample2\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmyheader_1\\tmyheader_2\\tmyheader_3\\tmyheader_4\\tmyheader_5\\tmyheader_6\\tmyheader_7\\tmyheader_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsample2\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -293,7 +293,7 @@ nextflow_pipeline { } test("Full pipeline with no metadata") { - tag "pipeline" + tag "pipeline_no_metadata" when { params { @@ -301,7 +301,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -339,7 +339,7 @@ nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert 
actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t2.2.2\\t2\\t2\\t2\\t\\t\\t\\t\\t\\t\\t\\t\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t1.2.2\\t1\\t2\\t2\\t\\t\\t\\t\\t\\t\\t\\t\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -365,7 +365,7 @@ nextflow_pipeline { } test("Full pipeline with little metadata") { - tag "pipeline" + tag "pipeline_little_metadata" when { params { @@ -373,7 +373,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -411,7 +411,7 @@ nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t1.4\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t\\t\\t\\t\\t\\t3.8\\n") + assert 
actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t1.4\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t\\t\\t\\t\\t\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -448,7 +448,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -497,7 +497,7 @@ nextflow_pipeline { // Check that the ArborView output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json @@ -532,7 +532,7 @@ nextflow_pipeline { outdir = "results" pd_distm = "scaled" - gm_thresholds = "1,0.5,0" + gm_thresholds = "50,20,0" } } @@ -580,7 +580,7 @@ nextflow_pipeline { // Check that the ArborView 
output is created def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html") assert actual_arborview.exists() - assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t2.2.2\\t2\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") + assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t1.2.2\\t1\\t2\\t2\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n") // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json diff --git a/tests/pipelines/main_gm_thresholds.nf.test b/tests/pipelines/main_gm_thresholds.nf.test index 60f8cb1..ee857cd 100644 --- a/tests/pipelines/main_gm_thresholds.nf.test +++ b/tests/pipelines/main_gm_thresholds.nf.test @@ -65,14 +65,14 @@ nextflow_pipeline { input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv" outdir = "results" - gm_thresholds = "0.5,2" + gm_thresholds = "200,50" pd_distm = "scaled" } } then { assert workflow.failed - assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 0.5,2' contains thresholds outside of range [0,1]." + assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 200,50' contains thresholds outside of range [0, 100]." 
+ " Please either set '--pd_distm hamming' or adjust the threshold values.") } } diff --git a/workflows/gasclustering.nf b/workflows/gasclustering.nf index 5e0303c..d08a075 100644 --- a/workflows/gasclustering.nf +++ b/workflows/gasclustering.nf @@ -122,8 +122,8 @@ workflow GASCLUSTERING { + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") } } else if (params.pd_distm == 'scaled') { - if (gm_thresholds_list.any { it != null && (it as Float < 0 || it as Float > 1) }) { - exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0,1]." + if (gm_thresholds_list.any { it != null && (it as Float < 0.0 || it as Float > 100.0) }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0, 100]." + " Please either set '--pd_distm hamming' or adjust the threshold values.") } } else { From 85ac8e5d769e218ccfdc8b61f41d54865b68d597 Mon Sep 17 00:00:00 2001 From: Eric Marinier Date: Mon, 24 Jun 2024 09:18:21 -0500 Subject: [PATCH 09/14] Correcting spelling mistake. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ac1417..97fcc46 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Hamming distances are integers representing the number of differing loci between ### Scaled Distances -Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. Whening using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). +Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. 
When using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). ### Thresholds From 76224b96c09d7c34c784ecd64f10c259158db30f Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 24 Jun 2024 11:46:08 -0500 Subject: [PATCH 10/14] Updating to version 0.2.0. --- CHANGELOG.md | 10 ++++++++++ nextflow.config | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c3216e..7658751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.0] - 2024-06-24 + +### Added + +- Support for mismatched IDs between the samplesheet ID and the ID listed in the corresponding allele file. + +### Fixed + +- The scaled distance thresholds provided when using `--pd_distm scaled` and `--gm_thresholds` are now correctly understood as percentages in the range [0.0, 100.0]. + ## [0.1.0] - 2024-05-28 Initial release of the Genomic Address Service Clustering pipeline to be used for distance-based clustering of cg/wgMLST data. diff --git a/nextflow.config b/nextflow.config index 9156445..1dad6fb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -226,7 +226,7 @@ manifest { description = """IRIDA Next Genomic Address Service Clustering Pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.1.0' + version = '0.2.0' doi = '' defaultBranch = 'main' } From b7cfcdb1d0e323bca4ebc6b258b3e156a243da75 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 25 Jun 2024 12:43:14 -0500 Subject: [PATCH 11/14] Updating date, adding tag. 
--- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7658751..5405516 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.2.0] - 2024-06-24 +## [0.2.0] - 2024-06-26 ### Added @@ -23,3 +23,4 @@ Initial release of the Genomic Address Service Clustering pipeline to be used fo - Output of a dendrogram, cluster codes, and visualization using [profile_dists](https://github.com/phac-nml/profile_dists), [gas mcluster](https://github.com/phac-nml/genomic_address_service), and ArborView. [0.1.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.1.0 +[0.2.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.2.0 From 7dc8a234e69f53f723aa55a620631a4b502c77a4 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 25 Jun 2024 13:56:53 -0500 Subject: [PATCH 12/14] updated arborview output --- assets/ArborView.html | 159 +++++++++++++++++++++++++++--------------- 1 file changed, 103 insertions(+), 56 deletions(-) diff --git a/assets/ArborView.html b/assets/ArborView.html index 3358738..3b1c1cd 100644 --- a/assets/ArborView.html +++ b/assets/ArborView.html @@ -101,6 +101,9 @@ + + +