Merge pull request #32 from phac-nml/dev

Minor Release: 0.4.0
phac-nml · Nov 7, 2024 · 9d2649c · 9d2649c
2 parents 9bbd787 + 5b13810
commit 9d2649c
Show file tree

Hide file tree

Showing 21 changed files with 213 additions and 52 deletions.
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -1,6 +1,6 @@
 name: nf-core linting
 # This workflow is triggered on pushes and PRs to the repository.
-# It runs the `nf-core lint` and markdown lint tests to ensure
+# It runs the `nf-core pipelines lint` and markdown lint tests to ensure
 # that the code meets the nf-core guidelines.
 on:
   push:
@@ -41,17 +41,32 @@ jobs:
           python-version: "3.12"
           architecture: "x64"
 
+      - name: read .nf-core.yml
+        uses: pietrobolcato/action-read-yaml@1.1.0
+        id: read_yml
+        with:
+          config: ${{ github.workspace }}/.nf-core.yml
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install nf-core
+          pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }}
+
+      - name: Run nf-core pipelines lint
+        if: ${{ github.base_ref != 'master' }}
+        env:
+          GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
+        run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
 
-      - name: Run nf-core lint
+      - name: Run nf-core pipelines lint --release
+        if: ${{ github.base_ref == 'master' }}
         env:
           GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
-        run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
+        run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
 
       - name: Save PR number
         if: ${{ always() }}

diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download lint results
-        uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3
+        uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6
         with:
           workflow: linting.yml
           workflow_conclusion: completed

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1,5 +1,5 @@
 repository_type: pipeline
-nf_core_version: "2.14.1"
+nf_core_version: "3.0.1"
 lint:
   files_exist:
     - assets/nf-core-gasclustering_logo_light.png
@@ -31,6 +31,9 @@ lint:
     - custom_config
     - manifest.name
     - manifest.homePage
+    - params.max_cpus
+    - params.max_memory
+    - params.max_time
   readme:
     - nextflow_badge
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,16 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.4.0] - 2024-11-07
+
+- Added the ability to include a `sample_name` column in the input samplesheet.csv. Allows for compatibility with IRIDA-Next input configuration.
+
+  - `sample_name` special characters (non-alphanumeric with exception of "_" and ".") will be replaced with `"_"`
+  - If no `sample_name` is supplied in the column, `sample` will be used
+  - To avoid repeat values for `sample_name` all `sample_name` values will be suffixed with the unique `sample` value from the input file
+
+- Fixed linting issues in CI caused by nf-core 3.0.1
+
 ## [0.3.0] - 2024-09-10
 
 ### Changed
@@ -41,3 +51,4 @@ Initial release of the Genomic Address Service Clustering pipeline to be used fo
 [0.1.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.1.0
 [0.2.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.2.0
 [0.3.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.3.0
+[0.4.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.4.0
diff --git a/README.md b/README.md
@@ -16,6 +16,16 @@ The input to the pipeline is a standard sample sheet (passed as `--input samples
 
 The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). Details on the columns can be found in the [Full samplesheet](docs/usage.md#full-samplesheet) documentation.
 
+## IRIDA-Next Optional Input Configuration
+
+`gasclustering` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name`
+
+`sample_name`: An **optional** column that overrides `sample` for outputs (filenames and sample names) and reference assembly identification.
+
+`sample_name` allows more flexibility in naming output files or sample identification. Unlike `sample`, `sample_name` is not required to contain unique values. `Nextflow` requires unique sample names, and therefore in the instance of repeat `sample_names`, `sample` will be suffixed to any `sample_name`. Non-alphanumeric characters (excluding `_`,`-`,`.`) will be replaced with `"_"`.
+
+An [example samplesheet](tests/data/samplesheets/samplesheet-addsamplename.csv) has been provided with the pipeline.
+
 # Parameters
 
 The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run.

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/phac-nml/gasclustering/main/assets/schema_input.json",
     "title": "phac-nml/gasclustering pipeline - params.input schema",
     "description": "Schema for the file provided with params.input",
@@ -10,9 +10,14 @@
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "meta": ["id"],
+                "meta": ["irida_id"],
                 "unique": true,
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
+                "errorMessage": "Sample name must be provided and cannot contain spaces."
+            },
+            "sample_name": {
+                "type": "string",
+                "meta": ["id"],
+                "errorMessage": "Sample name is optional, if provided will replace sample for filenames and outputs"
             },
             "mlst_alleles": {
                 "type": "string",

diff --git a/conf/iridanext.config b/conf/iridanext.config
@@ -5,7 +5,7 @@ iridanext {
         overwrite = true
         validate = true
         files {
-            idkey = "id"
+            idkey = "irida_id"
             global = [
                 "**/ArborView/arborview.clustered_data_arborview.html",
                 "**/clusters/gas.mcluster.clusters.text",

diff --git a/docs/usage.md b/docs/usage.md
@@ -12,7 +12,7 @@ You will need to create a samplesheet with information about the samples you wou
 --input '[path to samplesheet file]'
 ```
 
-### Full samplesheet
+### Full Standard Samplesheet
 
 The input samplesheet must contain 10 columns: `sample`, `mlst_alleles`, `metadata_1`, `metadata_2`, ..., `metadata_8`. The `sample` IDs within a samplesheet should be unique. All other columns outside of the listed above will be ignored.
 
@@ -33,6 +33,30 @@ SAMPLE3,sample3.mlst.subtyping.json.gz,Canada,2021,,,,,,
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
+### IRIDA-Next Optional Samplesheet Configuration
+
+`gasclustering` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `mlst_alleles`, `metadata_1`, `metadata_2`, ..., `metadata_8`. The `sample` IDs within a samplesheet should be unique. All other columns outside of the listed above will be ignored.
+
+A final samplesheet file may look something like the one below.
+
+```csv title="samplesheet.csv"
+sample,sample_name,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+SAMPLE1,S1,sample1.mlst.json.gz,Canada,2024,,,,,,
+SAMPLE2,S2,sample2.mlst.json.gz,USA,2024,,,,,,
+SAMPLE3, ,sample3.mlst.subtyping.json.gz,Canada,2021,,,,,,
+```
+
+| Column                       | Description                                                                                                                                                                                                                                                                                                                      |
+| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`                     | Custom sample name. Samples should be unique within a samplesheet.                                                                                                                                                                                                                                                               |
+| `sample_name`                | Sample name used in outputs (filenames and sample names)                                                                                                                                                                                                                                                                         |
+| `mlst_alleles`               | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex](https://github.com/phac-nml/locidex). File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). |
+| `metadata_1` to `metadata_8` | Optional metadata values to integrate into the final visualization.                                                                                                                                                                                                                                                              |
+
+An [example samplesheet](../tests/data/samplesheets/samplesheet-addsamplename.csv) has been provided with the pipeline.
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:

diff --git a/modules/local/appendmetadata/main.nf b/modules/local/appendmetadata/main.nf
@@ -3,11 +3,11 @@ process APPEND_METADATA {
     label 'process_single'
 
     input:
-    val clusters_path  // cluster data as a TSV path
-                        // this needs to be "val", because "path"
-                        // won't stage the file correctly for exec
-    val metadata_rows  // metadata rows (no headers) to be appened, list of lists
-    val metadata_headers  // headers to name the metadata columns
+    val clusters_path       // cluster data as a TSV path
+                            // this needs to be "val", because "path"
+                            // won't stage the file correctly for exec
+    val metadata_rows       // metadata rows (no headers) to be appended, list of lists
+    val metadata_headers    // headers to name the metadata columns
 
     output:
     path("clusters_and_metadata.tsv"), emit: clusters

diff --git a/nextflow.config b/nextflow.config
@@ -229,7 +229,7 @@ manifest {
     description     = """IRIDA Next Genomic Address Service Clustering Pipeline"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.3.0'
+    version         = '0.4.0'
     doi             = ''
     defaultBranch   = 'main'
 }

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/phac-nml/gasclustering/main/nextflow_schema.json",
     "title": "phac-nml/gasclustering pipeline parameters",
     "description": "IRIDA Next Example Pipeline",

diff --git a/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv
@@ -1,4 +1,4 @@
-id	address	level_1	level_2	level_3	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
-sampleA	1.1.1	1	1	1	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
-sampleB	1.1.1	1	1	1	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
-sampleC	1.2.2	1	2	2	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
+id	address	level_1	level_2	level_3	sample	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
+sampleA	1.1.1	1	1	1	sampleA	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
+sampleB	1.1.1	1	1	1	sampleB	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
+sampleC	1.2.2	1	2	2	sampleC	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
diff --git a/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv b/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv
@@ -1,4 +1,4 @@
-id	address	level_1	level_2	level_3	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
-sampleA	1.1.1	1	1	1	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
-sampleB	1.1.1	1	1	1	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
-sample3	1.2.2	1	2	2	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
+id	address	level_1	level_2	level_3	sample	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
+sampleA	1.1.1	1	1	1	sampleA	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
+sampleB	1.1.1	1	1	1	sampleB	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
+sample3	1.2.2	1	2	2	sample3	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
diff --git a/tests/data/append/expected_clusters_and_metadata.tsv b/tests/data/append/expected_clusters_and_metadata.tsv
@@ -1,4 +1,4 @@
-id	address	level_1	level_2	level_3	myheader_1	myheader_2	myheader_3	myheader_4	myheader_5	myheader_6	myheader_7	myheader_8
-sample1	1.1.1	1	1	1	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
-sample2	1.1.1	1	1	1	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
-sample3	1.2.2	1	2	2	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
+id	address	level_1	level_2	level_3	sample	myheader_1	myheader_2	myheader_3	myheader_4	myheader_5	myheader_6	myheader_7	myheader_8
+sample1	1.1.1	1	1	1	sample1	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
+sample2	1.1.1	1	1	1	sample2	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
+sample3	1.2.2	1	2	2	sample3	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
diff --git a/tests/data/append/expected_clusters_and_metadata_addsamplename.tsv b/tests/data/append/expected_clusters_and_metadata_addsamplename.tsv
@@ -0,0 +1,4 @@
+id	address	level_1	level_2	level_3	sample	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
+S_1	1.1.1	1	1	1	sample1	1.1	1.2	1.3	1.4	1.5	1.6	1.7	1.8
+S2_	1.1.1	1	1	1	sample2	2.1	2.2	2.3	2.4	2.5	2.6	2.7	2.8
+S2__sample3	1.2.2	1	2	2	sample3	3.1	3.2	3.3	3.4	3.5	3.6	3.7	3.8
diff --git a/tests/data/append/expected_clusters_and_metadata_hamming.tsv b/tests/data/append/expected_clusters_and_metadata_hamming.tsv
@@ -1,4 +1,4 @@
-id	address	level_1	level_2	level_3	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
-sample1	1.1.1	1	1	1								
-sample2	1.1.2	1	1	2								
-sample3	2.2.3	2	2	3								
+id	address	level_1	level_2	level_3	sample	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
+sample1	1.1.1	1	1	1	sample1								
+sample2	1.1.2	1	1	2	sample2								
+sample3	2.2.3	2	2	3	sample3								
diff --git a/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv b/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv
@@ -1,4 +1,4 @@
-id	address	level_1	level_2	level_3	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
-sample1	1.1.1	1	1	1				1.4				
-sample2	1.1.1	1	1	1								
-sample3	1.2.2	1	2	2	3.1	3.2						3.8
+id	address	level_1	level_2	level_3	sample	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
+sample1	1.1.1	1	1	1	sample1				1.4				
+sample2	1.1.1	1	1	1	sample2								
+sample3	1.2.2	1	2	2	sample3	3.1	3.2						3.8
diff --git a/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv b/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv
@@ -1,4 +1,4 @@
-id	address	level_1	level_2	level_3	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
-sample1	1.1.1	1	1	1								
-sample2	1.1.1	1	1	1								
-sample3	1.2.2	1	2	2								
+id	address	level_1	level_2	level_3	sample	metadata_1	metadata_2	metadata_3	metadata_4	metadata_5	metadata_6	metadata_7	metadata_8
+sample1	1.1.1	1	1	1	sample1								
+sample2	1.1.1	1	1	1	sample2								
+sample3	1.2.2	1	2	2	sample3								
diff --git a/tests/data/samplesheets/samplesheet-addsamplename.csv b/tests/data/samplesheets/samplesheet-addsamplename.csv
@@ -0,0 +1,4 @@
+sample,sample_name,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,S 1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
+sample2,S2#,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
+sample3,S2_,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8