Skip to content

Commit

Permalink
🔀 Merge pull request #74 from cnr-ibba/issue-73
Browse files Browse the repository at this point in the history
⚡ Optimize `samtools/depth` step
  • Loading branch information
bunop authored Apr 11, 2024
2 parents 055c09d + 0b84168 commit d41676e
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 53 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"coverage",
"cpus",
"demultiplexing",
"depthfile",
"downsampling",
"fasta",
"fastq",
Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 0.6.1 - [2024-04-11]

- Calculate `samtools/depth` on each chromosomes ([#73](https://github.com/cnr-ibba/nf-resequencing-mem/issues/73))

### `Fixed`

- Passing args to `modules/local/freebayes_splitcram`
- Calculate `samtools/depth` without _0 coverage_ regions

## 0.6.0 - [2024-04-04]

- Replace `*.bam` file format with `*.cram` ([#9](https://github.com/cnr-ibba/nf-resequencing-mem/issues/9))
Expand Down
4 changes: 2 additions & 2 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
report_comment: >
This report has been generated by the <a href="https://github.com/cnr-ibba/nf-resequencing-mem/releases/tag/v0.6.0" target="_blank">nf-resequencing-mem</a>
This report has been generated by the <a href="https://github.com/cnr-ibba/nf-resequencing-mem/releases/tag/v0.6.1" target="_blank">nf-resequencing-mem</a>
analysis pipeline. For information about how to interpret these results, please see the
<a href="https://github.com/cnr-ibba/nf-resequencing-mem/blob/v0.6.0/README.md" target="_blank">documentation</a>.
<a href="https://github.com/cnr-ibba/nf-resequencing-mem/blob/v0.6.1/README.md" target="_blank">documentation</a>.
report_section_order:
"nf-resequencing-mem-methods-description":
order: -1000
Expand Down
80 changes: 49 additions & 31 deletions bin/split_ref_by_depth.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@
# -*- coding: utf-8 -*-

"""
This script will write a list of region from a reference file relying on
coverage depth
coverage depth. Is supposed to work on a single chromosome at a time, it
requires chromosome length in order to work on sample depth with no 0 regions
Author: Paolo Cozzi
Date: 2024/03/21
Date: 2024/04/11
Usage:
python split_ref_by_depth.py --depth_file <depth_file>
python split_ref_by_depth.py --depth_file <depth_file> \
--chromosome <chromosome> --chromosome_length <chromosome_length> \
[--min_length <min_length>] [--max_coverage <max_coverage>] \
[--overlap_size <overlap_size>] [--verbose <verbose>] [--quiet <quiet>]
Arguments:
depth_file: The path of samtools depth output file
Expand Down Expand Up @@ -126,49 +131,40 @@ def append_or_extend_region(


def split_ref_by_coverage(
depthfile: str, max_coverage: int, min_length: int, overlap_size: int):
depthfile: str, chromosome: str, chromosome_length: int,
max_coverage: int, min_length: int, overlap_size: int):
with gzip.open(depthfile, "rt") as handle:
reader = csv.reader(handle, delimiter="\t", lineterminator="\n")
header = next(reader)

# header: ["#CHROM", "POS", "Sample_1.bam", "Sample_2.bam", ...]
logging.debug(f"Got header: {header}")

# Inizialize some variables
line = next(reader)
start_chrom = line[0]
start_pos = int(line[1])
old_pos = start_pos
cumulative_coverage = 0
# test if I have reads aligned in this file
try:
line = next(reader)

except StopIteration:
logging.error(
f"File '{depthfile}' has no coverage data"
)
return [[chromosome, 0, chromosome_length]]

# Initialize some variables
regions = []
start_chrom = chromosome
start_pos = 1
old_pos = start_pos
cumulative_coverage = 0

logging.debug(f"Starting from chrom: '{start_chrom}'")

for i, line in enumerate(reader):
chrom, pos = line[0], int(line[1])

if chrom != start_chrom:
logging.debug(f"{i}: Got a new chromosome '{chrom}'")
logging.debug(
f"{i}: Test for a new region with: "
f"{[start_chrom, start_pos, old_pos]}"
f" ({old_pos-start_pos+1} bp; "
f"{cumulative_coverage:.2e} cumulative coverage)"
)

# add and open a new region
regions = append_or_extend_region(
regions,
[start_chrom, start_pos, old_pos],
min_length,
overlap_size
)

# reset variables
start_chrom = chrom
start_pos = pos
cumulative_coverage = 0
logging.critical(f"{i}: Got a new chromosome '{chrom}'")
raise Exception("This script works on a single chromosome at a time")

# determine region size
length = pos - start_pos
Expand Down Expand Up @@ -219,6 +215,14 @@ def split_ref_by_coverage(
overlap_size
)

# check for last region end
last_region = regions[-1]
if last_region[2] < chromosome_length:
logging.debug(
f"Extending the last region: {last_region} to {chromosome_length}"
)
last_region[2] = chromosome_length

return regions


Expand All @@ -232,6 +236,14 @@ def split_ref_by_coverage(
'-d', '--depth_file', required=True,
help="The output of samtools depth file for all samples"
)
parser.add_argument(
'-c', '--chromosome', required=True, type=str,
help="The chromosome name"
)
parser.add_argument(
'-l', '--chromosome_length', required=True, type=int,
help="The length of the chromosome"
)
parser.add_argument(
"--min_length", default=100_000, type=int,
help="minimum fragment length in bp"
Expand Down Expand Up @@ -263,7 +275,13 @@ def split_ref_by_coverage(

# split reference by coverage depth
regions = split_ref_by_coverage(
args.depth_file, args.max_coverage, args.min_length, args.overlap_size)
args.depth_file,
args.chromosome,
args.chromosome_length,
args.max_coverage,
args.min_length,
args.overlap_size
)

logging.info(f"Number of regions: {len(regions)}")

Expand Down
6 changes: 3 additions & 3 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@ process {
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
time = { check_max( 12.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
Expand Down
6 changes: 5 additions & 1 deletion modules/local/freebayes_splitcram.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ process FREEBAYES_SPLITCRAM {
def prefix = task.ext.prefix ?: "${meta.id}"
"""
split_ref_by_depth.py \\
--depth_file ${depth} > ${prefix}.regions.txt
${args} \\
--depth_file ${depth} \\
--chromosome ${meta.id} \\
--chromosome_length ${meta.length} \\
> ${prefix}.regions.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
11 changes: 6 additions & 5 deletions modules/nf-core/samtools/depth/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 15 additions & 7 deletions modules/nf-core/samtools/depth/samtools-depth.diff

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ manifest {
description = 'Nextflow Resequencing with BWA MEM'
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '0.6.0'
version = '0.6.1'
}

// Load modules.config for DSL2 module specific options
Expand Down
22 changes: 19 additions & 3 deletions subworkflows/local/cram_freebayes_parallel/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,16 @@ workflow CRAM_FREEBAYES_PARALLEL {
main:
ch_versions = Channel.empty()

// calculate total coverage depth for all samples
SAMTOOLS_DEPTH( bam, [[], []] )
// read chromosome list from fasta index
chromosome_ch = fai.map{ it -> it[1] }
.splitCsv(header: false, sep: '\t')
.map{ row -> [[id: row[0], length: row[1]], row[0]] }
// .view()

// calculate total coverage depth for all samples by chromosome
// bam is a value channel; chromosome_ch is a queue channel
// for every chromosome
SAMTOOLS_DEPTH( bam, bai, chromosome_ch )
ch_versions = ch_versions.mix(SAMTOOLS_DEPTH.out.versions)

// split fasta in chunks relying BAM size
Expand All @@ -33,7 +41,15 @@ workflow CRAM_FREEBAYES_PARALLEL {
.map{ it -> [[id: it.trim()], it.trim()]}

// call freebayes on each region
FREEBAYES_CHUNK ( regions_ch, bam, bai, SAMTOOLS_DEPTH.out.bam_list, fasta, fai )
FREEBAYES_CHUNK (
regions_ch,
bam,
bai,
// converting to a value channel
SAMTOOLS_DEPTH.out.bam_list.first(),
fasta,
fai
)
ch_versions = ch_versions.mix(FREEBAYES_CHUNK.out.versions)

// merge freebayes chunks
Expand Down

0 comments on commit d41676e

Please sign in to comment.