Merge pull request #20 from phac-nml/dev

Minor Release 1.2.0
phac-nml · Nov 22, 2024 · d766d34 · d766d34
2 parents d8146d7 + aa8d37f
commit d766d34
Show file tree

Hide file tree

Showing 31 changed files with 360 additions and 109 deletions.
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -11,19 +11,6 @@ on:
     types: [published]
 
 jobs:
-  EditorConfig:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-
-      - uses: actions/setup-node@v3
-
-      - name: Install editorconfig-checker
-        run: npm install -g editorconfig-checker
-
-      - name: Run ECLint check
-        run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile\|.sra')
-
   Prettier:
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download lint results
-        uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3
+        uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6
         with:
           workflow: linting.yml
           workflow_conclusion: completed

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -34,5 +34,8 @@ lint:
     - custom_config
     - manifest.name
     - manifest.homePage
+    - params.max_cpus
+    - params.max_memory
+    - params.max_time
   readme:
     - nextflow_badge
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,3 +3,8 @@ repos:
     rev: "v2.7.1"
     hooks:
       - id: prettier
+  - repo: https://github.com/editorconfig-checker/editorconfig-checker.python
+    rev: "2.7.3"
+    hooks:
+      - id: editorconfig-checker
+        alias: ec
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,22 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.0] - 2024-11-22
+
+### `Changed`
+
+- Modified the template for the input CSV file to include a `sample_name` column in addition to `sample`, in line with the [IRIDA-Next update] and as seen in the [speciesabundance pipeline]
+  - If `sample_name` is supplied, then the reads will have `sample_name` prefixed before the accession code
+  - `sample_name` special characters will be replaced with `"_"`
+- Reverted `fasterq-dump` version to 2.11.0 from 3.0.8 due to [issue #865]. Solution proposed by `fetchngs` in [PR #261]
+- Fixed linting issues in CI caused by `nf-core` 3.0.1
+- Updated `nf-test` snapshots and added new tests for `sample_name` feature
+
+[IRIDA-Next update]: https://github.com/phac-nml/irida-next/pull/678
+[speciesabundance pipeline]: https://github.com/phac-nml/speciesabundance/pull/24
+[issue #865]: https://github.com/ncbi/sra-tools/issues/865
+[PR #261]: https://github.com/nf-core/fetchngs/pull/261
+
 ## [1.1.1] - 2024-04-19
 
 ### Added
@@ -29,3 +45,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Initial release of fetchdatairidanext pipeline which will download reads from NCBI/INSDC archives.
+
+[1.2.0]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.2.0
+[1.1.1]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.1.1
+[1.1.0]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.1.0
+[1.0.1]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.0.1
+[1.0.0]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.0.0
diff --git a/README.md b/README.md
@@ -20,10 +20,22 @@ That is, there are two columns:
 
 The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). An example of this file is provided at [assets/samplesheet.csv](assets/samplesheet.csv).
 
+## IRIDA-Next Optional Input Configuration
+
+`fetchdatairidanext` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name`
+
+`sample_name`: An **optional** column, to add the `sample_name` prefix before the accession code.
+
+`sample_name` allows more flexibility in naming reads. Unlike `sample`, `sample_name` is not required to contain unique values. Non-alphanumeric characters (excluding `_`,`-`,`.`) will be replaced with `"_"`. To provide `sample_name` without renaming the reads, set `--rename_with_samplename false`.
+
+An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline.
+
 # Parameters
 
 The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers (or `-profile docker` for docker) and `-r [branch]` to specify which GitHub branch you would like to run.
 
+`--rename_with_samplename` (Default: `true`) When `false`, the samplesheet column `sample_name` is not used to rename reads.
+
 Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json).
 
 # Running

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/main/assets/schema_input.json",
     "title": "phac-nml/fetchdatairidanext pipeline - params.input schema",
     "description": "Schema for the file provided with params.input",
@@ -9,11 +9,16 @@
         "properties": {
             "sample": {
                 "type": "string",
-                "pattern": "^\\S+$",
-                "meta": ["id"],
+                "pattern": "^[A-Za-z0-9_.-]+$",
+                "meta": ["irida_id"],
                 "unique": true,
                 "errorMessage": "Sample name must be provided and cannot contain spaces"
             },
+            "sample_name": {
+                "type": "string",
+                "meta": ["id"],
+                "errorMessage": "Optional. Used to override reads filename when used in tools like IRIDA-Next"
+            },
             "insdc_accession": {
                 "type": "string",
                 "pattern": "^(SRR|ERR|DRR)\\S+$",

diff --git a/conf/iridanext.config b/conf/iridanext.config
@@ -5,7 +5,7 @@ iridanext {
         overwrite = true
         validate = true
         files {
-            idkey = "id"
+            idkey = "irida_id"
             global = ["**/prefetch/failures_report.csv"]
             samples = ["**/reads/*.fastq.gz"]
         }

diff --git a/conf/modules.config b/conf/modules.config
@@ -37,5 +37,20 @@ process {
             mode: params.publish_dir_mode,
             pattern: 'reads/*.fastq.gz'
         ]
+        def fasterq_rename = {String sample_name, String accession -> "--outfile ${sample_name}_${accession}"}
+        def add_extension = {String sample_name, String accession -> "${sample_name}_${accession}"}
+
+
+
+        ext.args = {
+            [
+                (meta.id && params.rename_with_samplename) ? fasterq_rename(meta.id, meta.insdc_accession) : ""
+            ].join(" ")
+        }
+        ext.args2 = {
+            [
+                (meta.id && params.rename_with_samplename) ? add_extension(meta.id, meta.insdc_accession) : meta.insdc_accession
+            ].join(" ")
+        }
     }
 }
diff --git a/docs/output.md b/docs/output.md
@@ -29,7 +29,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - `sratools/`
   - Sequence data in SRA format: `INSDC_ACCESSION/INSDC_ACCESSION.sra`
 - `reads/`
-  - Reads in fastq format: `INSDC_ACCESSION.fastq.gz`
+  - Reads in fastq format: `INSDC_ACCESSION.fastq.gz` (or alternatively `SAMPLE_NAME_INSDC_ACCESSION.fastq.gz` if `sample_name` provided)
 
 </details>
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -31,6 +31,26 @@ SAMPLE2,SRR13191702
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
+### IRIDA-Next Optional Samplesheet Configuration
+
+`fetchdatairidanext` also accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `insdc_accession`. The `sample` column values within a samplesheet should be unique.
+
+A final samplesheet file consisting of samples, optional sample names, and INSDC accessions may look something like the one below:
+
+```console
+sample,sample_name,insdc_accession
+SAMPLE1,S1,ERR1109373
+SAMPLE2,,SRR13191702
+```
+
+| Column            | Description                                                                         |
+| ----------------- | ----------------------------------------------------------------------------------- |
+| `sample`          | Custom sample name. Samples should be unique within a samplesheet.                  |
+| `sample_name`     | Provides custom prefix to read filenames                                            |
+| `insdc_accession` | The accession (run accession) from one of the INSDC databases (NCBI, ENA, or DDBJ). |
+
+An [example samplesheet](../tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline.
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:
@@ -132,6 +152,10 @@ You can also supply a run name to resume a specific run: `-resume [run-name]`. U
 
 Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information.
 
+### `--rename_with_samplename`
+
+When `sample_name` is included in the sample sheet, it will be prefixed to read filenames (Default: true)
+
 ## Custom configuration
 
 ### Resource requests

diff --git a/modules.json b/modules.json
@@ -15,9 +15,14 @@
                         "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620",
                         "installed_by": ["modules"]
                     },
+                    "sratools/fasterqdump": {
+                        "branch": "master",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "installed_by": ["modules"]
+                    },
                     "sratools/prefetch": {
                         "branch": "master",
-                        "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     }
                 }

diff --git a/modules/local/prefetchchecker/main.nf b/modules/local/prefetchchecker/main.nf
@@ -11,11 +11,20 @@ process PREFETCH_CHECKER {
     exec:
     task.workDir.resolve("failures_report.csv").withWriter { writer ->
 
-        writer.writeLine("sample,error_accession")  // header
+        sample_name = false
+        failures.each {
+            if ( it[0].id != null) {
+                sample_name = true
+        }
+        }
 
         // Failures
-        if (failures.size() > 0) {
-            failures.each { writer.writeLine "${it[0].id},${it[1]}" }
+        if (failures.size() > 0 && sample_name) {
+            writer.writeLine("sample,sample_name,error_accession")  // header
+            failures.each { writer.writeLine "${it[0].irida_id},${it[0].id},${it[1]}" }
+        } else {
+            writer.writeLine("sample,error_accession")  // header
+            failures.each { writer.writeLine "${it[0].irida_id},${it[1]}" }
         }
     }
 }
diff --git a/modules/local/sratools/fasterqdump/environment.yml b/modules/local/sratools/fasterqdump/environment.yml
@@ -4,5 +4,5 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bioconda::sra-tools=3.0.8
+  - bioconda::sra-tools=2.11.0
   - conda-forge::pigz=2.6
diff --git a/modules/local/sratools/fasterqdump/main.nf b/modules/local/sratools/fasterqdump/main.nf
@@ -4,8 +4,8 @@ process SRATOOLS_FASTERQDUMP {
 
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' :
-        'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' }"
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' :
+        'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }"
 
     input:
     tuple val(meta), path(sra)
@@ -22,6 +22,7 @@ process SRATOOLS_FASTERQDUMP {
     script:
     def args = task.ext.args ?: ''
     def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     def key_file = ''
 
@@ -46,8 +47,10 @@ process SRATOOLS_FASTERQDUMP {
         ${key_file} \\
         ${sra}
 
+    find reads/ -type f   -name "$args2" -exec mv {} {}.fastq \\;
+
     pigz \\
-        $args2 \\
+        $args3 \\
         --no-name \\
         --processes $task.cpus \\
         reads/*.fastq

diff --git a/modules/nf-core/sratools/prefetch/environment.yml b/modules/nf-core/sratools/prefetch/environment.yml
diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf