🔀 Merge pull request #62 from cnr-ibba/dev
🔖 release v0.6.0
bunop authored Apr 8, 2024
2 parents 7985147 + 60053a6 commit 055c09d
Showing 143 changed files with 4,939 additions and 1,194 deletions.
65 changes: 13 additions & 52 deletions .github/workflows/linting.yml
@@ -11,72 +11,33 @@ on:
types: [published]

jobs:
EditorConfig:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4

- uses: actions/setup-node@v4

- name: Install editorconfig-checker
run: npm install -g editorconfig-checker

- name: Run ECLint check
run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile')

Prettier:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-node@v4

- name: Install Prettier
run: npm install -g prettier

- name: Run Prettier --check
run: prettier --check ${GITHUB_WORKSPACE}

PythonBlack:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Check code lints with Black
uses: psf/black@stable

# If the above check failed, post a comment on the PR explaining the failure
- name: Post PR comment
if: failure()
uses: mshick/add-pr-comment@v1
- name: Set up Python 3.11
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
with:
message: |
## Python linting (`black`) is failing
To keep the code consistent with lots of contributors, we run automated code consistency checks.
To fix this CI test, please run:
* Install [`black`](https://black.readthedocs.io/en/stable/): `pip install black`
* Fix formatting errors in your pipeline: `black .`
Once you push these changes the test should pass, and you can hide this comment :+1:
python-version: 3.11
cache: "pip"

We highly recommend setting up Black in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help!
- name: Install pre-commit
run: pip install pre-commit

Thanks again for your contribution!
repo-token: ${{ secrets.GITHUB_TOKEN }}
allow-repeats: false
- name: Run pre-commit
run: pre-commit run --all-files

nf-core:
runs-on: ubuntu-latest
steps:
- name: Check out pipeline code
uses: actions/checkout@v4
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v1

- uses: actions/setup-python@v4
- uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
with:
python-version: "3.11"
architecture: "x64"
@@ -99,7 +60,7 @@ jobs:

- name: Upload linting log file artifact
if: ${{ always() }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4
with:
name: linting-logs
path: |
4 changes: 2 additions & 2 deletions .github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download lint results
uses: dawidd6/action-download-artifact@v2
uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3
with:
workflow: linting.yml
workflow_conclusion: completed
@@ -21,7 +21,7 @@ jobs:
run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT

- name: Post PR comment
uses: marocchino/sticky-pull-request-comment@v2
uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
number: ${{ steps.pr_number.outputs.pr_number }}
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,10 @@
repos:
- repo: https://github.com/pre-commit/mirrors-prettier
rev: "v3.1.0"
hooks:
- id: prettier
- repo: https://github.com/editorconfig-checker/editorconfig-checker.python
rev: "2.7.3"
hooks:
- id: editorconfig-checker
alias: ec
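
This new `.pre-commit-config.yaml` drives the `pre-commit` job in the linting workflow above. As a minimal sketch of how the same checks could be run locally before pushing (assuming a working Python environment):

```bash
# Install pre-commit, the same tool the linting workflow installs
pip install pre-commit

# Optionally register the hooks so they run automatically on every `git commit`
pre-commit install

# Run all configured hooks (prettier, editorconfig-checker) against the whole repository
pre-commit run --all-files
```
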
4 changes: 4 additions & 0 deletions .vscode/settings.json
@@ -3,6 +3,7 @@
"awsbatch",
"awsqueue",
"awsregion",
"bamaddrg",
"bcftools",
"bioinformatics",
"conda",
@@ -12,15 +13,18 @@
"downsampling",
"fasta",
"fastq",
"flagstat",
"freebayes",
"ibba",
"markduplicate",
"markduplicated",
"markduplicates",
"nextflow",
"outdir",
"ploidy",
"resequencing",
"samplesheet",
"samtools",
"slurm",
"SPLITBAM",
"subworkflow",
46 changes: 46 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,52 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 0.6.0 - [2024-04-04]

- Replace `*.bam` file format with `*.cram` ([#9](https://github.com/cnr-ibba/nf-resequencing-mem/issues/9))
- Add _Read Groups_ during the alignment step ([#57](https://github.com/cnr-ibba/nf-resequencing-mem/issues/57))
- Annotate VCF file with SnpEff ([#59](https://github.com/cnr-ibba/nf-resequencing-mem/issues/59))
- Configure MultiQC analysis ([#60](https://github.com/cnr-ibba/nf-resequencing-mem/issues/60))
- Update modules ([#64](https://github.com/cnr-ibba/nf-resequencing-mem/issues/64))

### `Added`

- Add `samtools/depth` process
- Add `freebayes_splitcram` custom module to split the genome into regions relying on
_total sample coverage_
- Add `cram_markduplicates_picard` custom local subworkflow by modifying
`bam_markduplicates_picard` to work with `*.cram` files by default
- Add `cram_stats_samtools` custom local subworkflow by modifying
`bam_stats_samtools` to work with `*.cram` files by default
- Add `freebayes_splitcram` local module to split alignment regions relying
on the `samtools/depth` step
- Add `snpeff/download` module
- Add `snpeff/snpeff` module
- Add `snpeff_annotate` local subworkflow

### `Fixed`

- `freebayes_parallel` subworkflow was moved to `cram_freebayes_parallel` local
subworkflow and was modified to deal with _total sample coverage_ and to work
with `*.cram` files
- `picard/markduplicates` now works with `*.cram` files
- `bwa/mem` was configured to write files as `*.cram` files
- `samtools/depth` was patched to write results with headers, to include zero-coverage
positions, and to compress output with gzip
- `resequencing-mem` workflow was modified in order to use local subworkflows, for
example to deal with `samtools` and `markduplicates`
- Fixed an issue when providing the `--genome_bwa_index` parameter
- `snpeff_download` was patched in order to remove the `version` parameter
- `snpeff/snpeff` module was patched to support custom database annotations and
to compress VCF output using a mulled image with `tabix`
- the configuration file for the MultiQC module was updated to simplify results, to order
them, and to cover all the supported modules

### `Removed`

- Remove `cnr-ibba/bamaddrg` module
- Remove `cnr-ibba/freebayes/splitbam` module

## 0.5.2 - [2023-12-21]

- Use MultiQC with all supported tools ([#53](https://github.com/cnr-ibba/nf-resequencing-mem/issues/53))
6 changes: 4 additions & 2 deletions CITATIONS.md
@@ -10,8 +10,6 @@
## Pipeline tools

- [bamaddrg](https://github.com/ekg/bamaddrg)

- [bcftools](https://samtools.github.io/bcftools/)

- [bwa](http://bio-bwa.sourceforge.net/)
@@ -36,6 +34,10 @@

> <https://doi.org/10.1371/journal.pone.0163962>
- [SnpEff](https://pcingola.github.io/SnpEff/)

> "A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.", Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. Fly (Austin). 2012 Apr-Jun;6(2):80-92. PMID: 22728672
- [tabix](https://www.htslib.org/doc/tabix.html)

- [trimgalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) Paolo Cozzi <[email protected]>
Copyright (c) Paolo Cozzi, Barbara Lazzari

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
88 changes: 70 additions & 18 deletions README.md
@@ -2,7 +2,13 @@

<!-- markdownlint-disable MD014 -->

[![Nextflow](https://img.shields.io/badge/nextflow_DSL2-%E2%89%A521.10.6-green)](https://www.nextflow.io/)
[![GitHub Actions CI Status](https://github.com/cnr-ibba/nf-resequencing-mem/actions/workflows/ci.yml/badge.svg)](https://github.com/cnr-ibba/nf-resequencing-mem/actions/workflows/ci.yml)
[![GitHub Actions Linting Status](https://github.com/cnr-ibba/nf-resequencing-mem/actions/workflows/linting.yml/badge.svg)](https://github.com/cnr-ibba/nf-resequencing-mem/actions/workflows/linting.yml)
[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)

## Overview

@@ -31,10 +37,6 @@ nextflow manual, and lets nextflow to download and execute the pipeline, for exa
nextflow pull cnr-ibba/nf-resequencing-mem
```

You will need also to define your credentials for private
repositories. See [SCM configuration file](https://www.nextflow.io/docs/latest/sharing.html#scm-configuration-file)
for more details.

## Customize configuration

When running Nextflow, Nextflow looks for a file named `nextflow.config` in the
@@ -82,7 +84,7 @@ used to save _intermediate results_ or to skip a particular step:

- `--genome_fasta_fai`: path to fasta index file (skip fasta index step)
- `--genome_bwa_index`: path to genome bwa index directory (skip bwa index step)
- `--save_bam`: (bool, def. false) save _markduplicated_ bam files with their indexes
- `--save_cram`: (bool, def. false) save _markduplicated_ cram files with their indexes
in the results folder
- `--save_trimmed`: (bool, def. false) save trimmed reads in results folder
- `--save_fasta_index`: (bool, def. false) save fasta index (for reusing with this pipeline)
@@ -91,17 +93,47 @@ used to save _intermediate results_ or to skip a particular step:
- `--remove_fastq_duplicates`: (bool, def. false) remove FASTQ duplicates by IDs
- `--save_unique_fastq`: (bool, def. false) write de-duplicated FASTQ files (requires the
`--remove_fastq_duplicates` option)
- `--snpeff_database`: annotate the VCF file with SnpEff by providing a pre-built
database, whose name can be found using the `java -jar snpEff.jar databases` command.
If the database is known to SnpEff, it will be downloaded and managed by the
pipeline itself (see the example after this list)
- `--snpeff_cachedir`: SnpEff cache directory. It must contain a subdirectory with
the same name as `--snpeff_database`, holding a valid SnpEff database. It is
required when annotating with SnpEff using a custom database
- `--snpeff_config`: SnpEff custom config file. It is required **only** with a custom
database and needs to define the same custom database set by the `--snpeff_database`
option (see [Building databases](https://pcingola.github.io/SnpEff/snpeff/build_db/)
in the SnpEff documentation)
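
For example, a minimal sketch of a run with SnpEff annotation enabled, using a
pre-built database (all the placeholders are to be replaced with your own values):

```bash
# Annotate the resulting VCF with a pre-built SnpEff database;
# the database is downloaded and managed by the pipeline itself
nextflow run cnr-ibba/nf-resequencing-mem -resume -profile <your profile> \
  --input <samplesheet.csv> \
  --genome_fasta <genome_fasta> \
  --outdir <results dir> \
  --snpeff_database <snpeff database name>
```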

You can have a list of available parameters by calling:

```bash
nextflow run cnr-ibba/nf-resequencing-mem --help
```

In addition, instead of passing parameters using CLI, you can create a custom configuration
file and define each params in the _params scope_. Parameters have the same name
used within the CLI, but without the `--` prefix. For example if you create a
`custom.config` file like this
In addition, instead of passing parameters on the CLI, you can create a custom
configuration file and define each parameter in the _params scope_. According to
Nextflow _best practices_, parameters defined in the _params scope_ should be
placed in a _JSON_ file, for example `params.json`:

```json
{
"input": "<samplesheet.csv>",
"genome_fasta": "<genome_fasta>",
"outdir": "<results dir>",
"save_fasta_index": true
}
```

Parameters have the same name used within the CLI, but without the `--` prefix.
Nextflow can be called like this:

```bash
nextflow run cnr-ibba/nf-resequencing-mem -resume -profile <your profile> \
-params-file params.json
```

Alternatively, you can create a `custom.config` file like this:

```conf
params {
@@ -119,6 +151,9 @@ nextflow run cnr-ibba/nf-resequencing-mem -resume -profile <your profile> \
-config custom.config
```

However, the custom configuration file should rather be used to specify the parameters
that can't be set on the command line, for example custom arguments to be passed
to a certain module using the `ext.args` option, as sketched below.
See Nextflow [Configuration](https://www.nextflow.io/docs/latest/config.html)
documentation for more information.
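
As a sketch, assuming you wanted to pass extra trimming arguments to the Trim Galore
step (the process selector and the argument below are illustrative, not taken from
the pipeline configuration):

```config
process {
    // forward extra command-line arguments to a single module;
    // adjust the selector and the arguments to your needs
    withName: TRIMGALORE {
        ext.args = '--quality 30'
    }
}
```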

@@ -297,17 +332,18 @@ process {

The freebayes step can take a lot of time when calculating SNPs with a lot of data.
This process is calculated using multiple processes by splitting the whole genome in
regions (relying on BAM alignment sizes), and then by calling SNPs on each region
regions (relying on CRAM alignment sizes), and then by calling SNPs on each region
on a single process.
In the last step, all results are collected and sorted to produce the final VCF file
(see `subworkflows/cnr-ibba/freebayes_parallel.nf` subworkflow for more information).
You can customize the region splitting, for example by using a smaller file size
(def. is `100e6`) in the split process like this:
(see `subworkflows/local/cram_freebayes_parallel` subworkflow for more information).
You can customize the region splitting, for example by using a greater cumulative
coverage (def. is `500_000_000`) or a different minimum fragment length (def. is `10_000`)
in the split process like this:

```text
```config
process {
withName: FREEBAYES_SPLITBAM {
ext.args = '--target-data-size 10e6'
withName: FREEBAYES_SPLITCRAM {
ext.args = '--max_coverage 100_000_000 --min_length 20_000'
}
}
```
@@ -324,7 +360,7 @@ number. Nextflow can resubmit such process increasing the required resources at
each step until `maxRetries` attempts are reached: you could increase the retry
attempts like this:

```text
```config
process {
withName: FREEBAYES_CHUNK {
maxRetries = 10
@@ -375,6 +411,22 @@ but an issue at _demultiplexing_ step: the only way to deal with this problem is
to get rid of duplicated IDs using [seqkit/rmdup](https://bioinf.shenwei.me/seqkit/usage/#rmdup)
by providing the `--remove_fastq_duplicates` option.
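
A minimal sketch of such a run (placeholders as in the previous examples):

```bash
# Drop reads with duplicated IDs before alignment and keep the de-duplicated FASTQ files
nextflow run cnr-ibba/nf-resequencing-mem -resume -profile <your profile> \
  --input <samplesheet.csv> \
  --genome_fasta <genome_fasta> \
  --outdir <results dir> \
  --remove_fastq_duplicates \
  --save_unique_fastq
```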

### MarkDuplicates temporary files

MarkDuplicates writes temporary files into the `/tmp` partition by default. If your
organization has a different location where temporary files should be stored
(e.g. `/scratch` or any other `$TMP` location) and your jobs are running out of
space, you should provide a different temporary location to the `MarkDuplicates` step,
for example:

```config
process {
withName: PICARD_MARKDUPLICATES {
ext.args = '--TMP_DIR $TMPDIR'
}
}
```

## Acknowledgments

This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE).
Empty file added assets/NO_FILE
Empty file.
