Skip to content

Commit

Permalink
Merge pull request #71 from phac-nml/INX/DBUpdate
Browse files Browse the repository at this point in the history
updated default databases to null
  • Loading branch information
mattheww95 authored May 2, 2024
2 parents af22d3c + 29dfc63 commit bb93c35
Show file tree
Hide file tree
Showing 19 changed files with 3,043 additions and 27 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v0.1.2 - [2024-05-02]

### Added

### Changed

- Changed default values for database parameters `--dehosting_idx`, `--mash_sketch`, `--kraken2_db`, and `--bakta_db` to null.
- Enabled checking for existence of database files in JSON Schema to avoid issues with staging non-existent files in Azure.
- Set `--kraken2_db` to be a required parameter for the pipeline.
- Hide bakta parameters from IRIDA Next UI.

## v0.1.1 - [2024-04-22]

### Added
Expand Down
8 changes: 6 additions & 2 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,14 @@ params {

platform = "illumina"

mash.mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
mash.mash_sketch = mash_sketch
mash.min_kmer = 1

r_contaminants.mega_mm2_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
r_contaminants.mega_mm2_idx = dehosting_idx
kraken2_db = "${projectDir}/tests/data/kraken2/test"
kraken.db = kraken2_db

fastp.args.illumina = "-Q"
min_reads = 100
Expand Down
10 changes: 5 additions & 5 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ params {


// Datasets
dehosting_idx = "./databases/PhiPacHum_m2.idx" // mm2 index
mash_sketch = "./databases/GTDBSketch_20231003.msh" // Make sure comments are formatted as taxonomic strings
bakta_db = "./databases/db-light"
kraken2_db = "./databases/k2_standard_20220607/"
dehosting_idx = null // mm2 index
mash_sketch = null // Make sure comments are formatted as taxonomic strings
bakta_db = null
kraken2_db = null
staramr_db = null // Recommended usage is to use the default database in the container


Expand Down Expand Up @@ -1026,7 +1026,7 @@ manifest {
description = """Mikrokondo beta"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '0.1.1'
version = '0.1.2'
defaultBranch = 'main'
doi = ''
}
Expand Down
28 changes: 16 additions & 12 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,43 +76,46 @@
"properties": {
"dehosting_idx": {
"type": "string",
"default": "./databases/PhiPacHum_m2.idx",
            "description": "Minimap2 index for dehosting and kitome removal",
"pattern": "^\\S+$",
"exists": true,
"format": "file-path"
},
"mash_sketch": {
"type": "string",
"default": "./databases/GTDBSketch_20231003.msh",
            "description": "Mash sketch used for contamination detection and speciation (Sketch comments must be a taxonomic string similar to what Kraken2 outputs)",
"pattern": "^\\S+$",
"exists": true,
"format": "file-path"
},
"bakta_db": {
"kraken2_db": {
"type": "string",
"description": "Database use for bakta, this value is optional as bakta can be skipped",
"default": "./databases/db-light",
"description": "Kraken2 database",
"pattern": "^\\S+$",
"exists": true,
"format": "directory-path"
},
"kraken2_db": {
"bakta_db": {
"type": "string",
"default": "./databases/k2_standard_20220607/",
"description": "Kraken2 database",
            "description": "Database used for Bakta, this value is optional as Bakta can be skipped",
"pattern": "^\\S+$",
"format": "directory-path"
"exists": true,
"format": "directory-path",
"hidden": true
},
"staramr_db": {
"type": "string",
"description": "It is recommended to use the StarAMR database in the StarAMR container however, an external option can be specified",
"pattern": "^\\S+$",
"exists": true,
"format": "directory-path",
"hidden": true
}
},
"required": [
"dehosting_idx",
"mash_sketch"
"mash_sketch",
"kraken2_db"
],
"description": "The location of databases used by mikrokondo"
},
Expand Down Expand Up @@ -355,8 +358,9 @@
},
"skip_bakta": {
"type": "boolean",
"default": true,
"description": "Skip annotation with Bakta"
"default": true,
"description": "Skip annotation with Bakta",
"hidden": true
},
"skip_abricate": {
"type": "boolean",
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/annotate_genomes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ workflow ANNOTATE_GENOMES {
txt = channel.empty()
abricate_report = channel.empty()

if(!params.skip_bakta){
if(!params.skip_bakta && params.bakta.db){
db_file = Channel.value("${params.bakta.db}")
annotated = BAKTA_ANNOTATE(contig_data, db_file,
[], [], [], [], [], []) // empty channels for optional arguments
Expand Down
4 changes: 2 additions & 2 deletions subworkflows/local/clean_reads.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ workflow QC_READS {

// TODO add in code to check that there are always enough reads left over after decontamination
// TODO need to make sure that if one read is unmapped the other is not included as well
deconned_reads = REMOVE_CONTAMINANTS(reads, file(params.r_contaminants.mega_mm2_idx), Channel.value(platform_comp))
deconned_reads = REMOVE_CONTAMINANTS(reads, params.r_contaminants.mega_mm2_idx ? file(params.r_contaminants.mega_mm2_idx) : error("--dehosting_idx ${params.dehosting_idx} is invalid"), Channel.value(platform_comp))
versions = versions.mix(REMOVE_CONTAMINANTS.out.versions)


Expand Down Expand Up @@ -139,7 +139,7 @@ workflow QC_READS {
ch_prepped_reads = filtered_samples // put in un-downsampled reads
}

mash_screen_out = MASH_SCREEN(ch_prepped_reads, file(params.mash.mash_sketch))
mash_screen_out = MASH_SCREEN(ch_prepped_reads, params.mash.mash_sketch ? file(params.mash.mash_sketch) : error("--mash_sketch ${params.mash_sketch} is invalid"))

versions = versions.mix(mash_screen_out.versions)

Expand Down
4 changes: 2 additions & 2 deletions subworkflows/local/determine_species.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ workflow DETERMINE_SPECIES {
versions = Channel.empty()
if (params.run_kraken){
log.info "Running kraken2 for contigs classification"
KRAKEN(contigs, file(params.kraken.db))
KRAKEN(contigs, params.kraken.db ? file(params.kraken.db) : error("--kraken2_db ${params.kraken.db} is invalid"))

// join contigs for classification
split_contigs = KRAKEN.out.classified_contigs.join(KRAKEN.out.report).join(KRAKEN.out.kraken_output)
Expand All @@ -40,7 +40,7 @@ workflow DETERMINE_SPECIES {

}else {
log.info "Using mash screen for sample classification"
MASH_SCREEN(contigs, file(params.mash.mash_sketch))
MASH_SCREEN(contigs, params.mash.mash_sketch ? file(params.mash.mash_sketch) : error("--mash_sketch ${params.mash_sketch} is invalid"))
results = results.mix(MASH_SCREEN.out.mash_data)

parsed = PARSE_MASH(MASH_SCREEN.out.mash_data, Channel.value("top"))
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/split_metagenomic.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ workflow SPLIT_METAGENOMIC {
contigs = contigs.map{
meta, contigs, reads -> tuple(meta, contigs)
}
kraken_out = KRAKEN(contigs, file(params.kraken.db))
    kraken_out = KRAKEN(contigs, params.kraken.db ? file(params.kraken.db) : error("--kraken2_db ${params.kraken2_db} is invalid"))
staged_kraken_data = kraken_out.classified_contigs.join(kraken_out.report).join(kraken_out.kraken_output)

binned_data = BIN_KRAKEN2(staged_kraken_data, Channel.value(params.kraken_bin.taxonomic_level))
Expand Down
1 change: 1 addition & 0 deletions tests/data/kraken2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Data within this test is taken from the kraken2 test data set here: https://github.com/DerrickWood/kraken2/tree/master/data
76 changes: 76 additions & 0 deletions tests/data/kraken2/output.k2.cls.fa

Large diffs are not rendered by default.

Loading

0 comments on commit bb93c35

Please sign in to comment.