Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a draft for fasta_ltrretriever_lai #4984

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
include { LTRHARVEST } from '../../../modules/nf-core/ltrharvest/main'
include { LTRFINDER } from '../../../modules/nf-core/ltrfinder/main'
include { LTRRETRIEVER_LTRRETRIEVER } from '../../../modules/nf-core/ltrretriever/ltrretriever/main'
include { CAT_CAT } from '../../../modules/nf-core/cat/cat/main'
include { LTRRETRIEVER_LAI } from '../../../modules/nf-core/ltrretriever/lai/main'

workflow FASTA_LTRHARVEST_LTRFINDER_LTRRETRIEVER_LAI {

take:
ch_fasta // channel: [ val(meta), fasta ]
ch_monoploid_seqs // channel: [ val(meta2), txt ]; Optional: Set to [] if not needed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than use [] for nothing, I think it would better to say use Channel.empty(). This might simplify some logic downstream

// val(meta) from ch_fasta and val(meta2) from ch_monoploid_seqs are
// only required to have the same `id`
skip_lai // val(true|false)

main:
ch_versions = Channel.empty()

// Prapre input channels
ch_monoploid_seqs_plain = ( ch_monoploid_seqs ?: Channel.empty() )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming Channel.empty() as input rather than [] would mean this ternary statement isn't needed.

| filter { meta2, seqs -> seqs }
// Cater to channel: [ meta2, [] ]
| map { meta2, seqs -> [ meta2.id, seqs ] }


// collectFile: Shorten IDs
ch_shortidstsv_orig_fasta = ch_fasta
| map { meta, fasta -> shorten_ids ( meta, fasta ) }
| collectFile
| map { tsv ->
[ tsv.baseName.replace('.short.ids', ''), tsv ]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does .short.ids get removed here rather than naming the file without the .short.ids in the first place?

}
| join(
ch_fasta.map { meta, fasta -> [ meta.id, meta, fasta ] }
)
| map { id, tsv, meta, fasta -> [ meta, tsv, fasta ] }

ch_short_ids_tsv = ch_shortidstsv_orig_fasta
| map {meta, tsv, fasta -> [ meta, tsv ] }

ch_shortidstsv_orig_fasta_branch= ch_shortidstsv_orig_fasta
| branch { meta, tsv, fasta ->
change: ! tsv.text.contains('IDs have acceptable length and character')
nonchange: tsv.text.contains('IDs have acceptable length and character')
}

// collectFile: Create Fasta file with short IDs
ch_short_ids_fasta = ch_shortidstsv_orig_fasta_branch.change
| map { meta, tsv, fasta ->
def orig_to_new_ids = tsv.text.tokenize('\n').collectEntries { [ it.tokenize('\t')[0], it.tokenize('\t')[1] ] }

def fasta_records = fasta.splitFasta( record: [ id: true, sequence: true ] )
def new_fasta = fasta_records.collect { ">${orig_to_new_ids[it.id]}\n${it.sequence}" }.join('')

[ "${meta.id}.short.ids.fasta", new_fasta ]
}
| collectFile
| map { fasta -> [ fasta.baseName.replace('.short.ids', ''), fasta ] }
| join(
ch_shortidstsv_orig_fasta_branch.change
| map { meta, tsv, fasta -> [ meta.id, meta ] }
)
| map { id, fasta, meta -> [ meta, fasta ] }
| mix(
ch_shortidstsv_orig_fasta_branch.nonchange.map { meta, tsv, fasta -> [ meta, fasta ] }
)

// collectFile: Map monoploid seqs to short IDs
ch_short_monoploid_seqs = ch_short_ids_tsv
| map { meta, tsv -> [ meta.id, tsv ] }
| join(ch_monoploid_seqs_plain)
| map { id, tsv, seqs ->
map_monoploid_seqs_to_new_ids(id, tsv, seqs)
}
| collectFile(newLine:true)
| map { seqs ->
def id = seqs.name.split('.mapped.monoploid.seqs.txt')[0]

[ id, seqs ]
}
| join(
ch_short_ids_tsv
| map { meta, tsv -> [ meta.id, meta ] }
)
| map { id, seqs, meta -> [ meta, seqs ] }


// MODULE: LTRHARVEST
LTRHARVEST ( ch_short_ids_fasta )

ch_ltrharvest_scn = LTRHARVEST.out.scn
ch_versions = ch_versions.mix(LTRHARVEST.out.versions.first())

// MODULE: LTRFINDER
LTRFINDER ( ch_short_ids_fasta )

ch_ltrfinder_scn = LTRFINDER.out.scn
ch_versions = ch_versions.mix(LTRFINDER.out.versions.first())

// MODULE: CAT_CAT
ch_cat_cat_inputs = ch_ltrharvest_scn
| join(ch_ltrfinder_scn)
| map { meta, harvested, found -> [ meta, [ harvested, found ] ] }

CAT_CAT ( ch_cat_cat_inputs )

ch_ltr_candidates = CAT_CAT.out.file_out
ch_versions = ch_versions.mix(CAT_CAT.out.versions.first())

// MODULE: LTRRETRIEVER_LTRRETRIEVER
ch_ltrretriever_inputs = ch_short_ids_fasta.join(ch_ltr_candidates)

LTRRETRIEVER_LTRRETRIEVER (
ch_ltrretriever_inputs.map { meta, fasta, ltr -> [ meta, fasta ] },
ch_ltrretriever_inputs.map { meta, fasta, ltr -> ltr },
[],
[],
[]
)

ch_pass_list = LTRRETRIEVER_LTRRETRIEVER.out.pass_list
ch_annotation_out = LTRRETRIEVER_LTRRETRIEVER.out.annotation_out
ch_pass_out = ch_pass_list.join(ch_annotation_out)
ch_annotation_gff = LTRRETRIEVER_LTRRETRIEVER.out.annotation_gff
ch_ltrlib = LTRRETRIEVER_LTRRETRIEVER.out.ltrlib
ch_versions = ch_versions.mix(LTRRETRIEVER_LTRRETRIEVER.out.versions.first())

// MODULE: LTRRETRIEVER_LAI
ch_short_ids_fasta_mono = ch_short_ids_fasta
| join(
ch_short_monoploid_seqs,
by:0,
remainder: true
)
// Danger! This partial join can fail
| filter { meta, fasta, seqs -> fasta }
// This filter safeguards against fail on upstream
// process failure: https://github.com/nextflow-io/nextflow/issues/5043
// fasta may come from upstream processes
// seqs also comes from upstream processes, it is optional
// and may not be present for some of the combinations
| map { meta, fasta, seqs -> [ meta, fasta, seqs ?: [] ] }

ch_lai_inputs = skip_lai
? Channel.empty()
: ch_short_ids_fasta_mono
| join(
ch_pass_out
)
| map { meta, fasta, seqs, pass, out ->
[ meta, fasta, pass, out, seqs ]
}
LTRRETRIEVER_LAI(
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> [ meta, fasta ] },
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> pass },
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> out },
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> seqs }
)

ch_lai_log = LTRRETRIEVER_LAI.out.log
ch_lai_out = LTRRETRIEVER_LAI.out.lai_out
ch_versions = ch_versions.mix(LTRRETRIEVER_LAI.out.versions.first())

// collectFile: Restore gff ids
ch_gff_tsv_branch = ch_annotation_gff.join(ch_short_ids_tsv)
| branch { meta, gff, tsv ->
change: ! tsv.text.contains('IDs have acceptable length and character')
nochange: tsv.text.contains('IDs have acceptable length and character')
}

ch_collected_gff = ch_gff_tsv_branch.change
| map { meta, gff, tsv ->
restore_gff_ids ( meta.id, gff, tsv )
}
| collectFile
| map { gff ->
def id = gff.name.split('.restored.ids.gff3')[0]

[ id, gff ]
}
| join(
ch_gff_tsv_branch.change
| map { meta, gff, tsv -> [ meta.id, meta ] }
)
| map { id, gff, meta -> [ meta, gff ] }


ch_restored_gff = ch_gff_tsv_branch.nochange
| map { meta, gff, tsv -> [ meta, gff ] }
| mix(ch_collected_gff)

emit:
ltrlib = ch_ltrlib // channel: [ val(meta), fasta ]
annotation_gff = ch_restored_gff // channel: [ val(meta), gff ]
lai_log = ch_lai_log // channel: [ val(meta), log ]
lai_out = ch_lai_out // channel: [ val(meta), out ]
versions = ch_versions // channel: [ versions.yml ]
}


def map_monoploid_seqs_to_new_ids(id, short_ids_tsv, monoploid_seqs) {

def short_ids_head = short_ids_tsv.text.tokenize('\n')[0]

if (short_ids_head == "IDs have acceptable length and character. No change required.") {
return [ "${id}.mapped.monoploid.seqs.txt" ] + monoploid_seqs.text.tokenize('\n')
}

def orig_to_new_ids = [:]
short_ids_tsv.text.eachLine { line ->
def (original_id, renamed_id) = line.tokenize('\t')
orig_to_new_ids[original_id] = renamed_id
}

def mapped_ids = []
monoploid_seqs.text.eachLine { original_id ->
if (!orig_to_new_ids[original_id]) {
error "Faild to find $original_id in ${short_ids_tsv}" +
"\nThe short_ids_tsv file is malformed!"
}

mapped_ids.add(orig_to_new_ids[original_id])
}

return [ "${id}.mapped.monoploid.seqs.txt" ] + mapped_ids
}


def restore_gff_ids(id, file_gff, short_ids_tsv) {
def new_to_orig_ids = [:]

short_ids_tsv.text.eachLine { line ->
def columns = line.tokenize('\t')

def orig_id = columns[0]
def new_id = columns[1]
new_to_orig_ids[new_id] = orig_id
}

def restored_gff = []

file_gff.text.eachLine { line ->
if ( line.startsWith('##') ) {
restored_gff.add(line)
return
}

def columns = line.tokenize('\t')
def new_id = columns[0]
def orig_id = new_to_orig_ids[new_id]

restored_gff.add ( ( [ orig_id ] + columns[1..-1] ).join('\t') )
}

[ "${id}.restored.ids.gff3", restored_gff.join('\n') ]

}


def shorten_ids(meta, fasta) {

def ids_descs = fasta.splitFasta( record: [ id: true, desc: true ] )
def ids = ids_descs.collect { it.id }

if ( ids_descs.every { it.id.size() <= 13 && it.id ==~ /\w+/ && it.desc == null } ) {
return [ "${meta.id}.short.ids.tsv", 'IDs have acceptable length and character. No change required.' ]
}

def shortened_ids = ids.collect { id -> ( id.size() <= 13 && id ==~ /\w+/ ) ? [ id, id ] : [ id, "Ctg${id.md5()[0..<10]}" ] }

if ( shortened_ids.size() != shortened_ids.collect { it[1] }.unique().size() ) {
error "Failed to create a unique set of Fasta IDs for sample ${meta.id}. Manual shortening of the Fasta IDs is required."
} // The probability of this condition is very low but it is here to cover the edge cases
// It assumes that the input fasta is valid and all the sequences have unique IDs

[ "${meta.id}.short.ids.tsv", shortened_ids.collect { line -> line.join('\t') }.join('\n') ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: "fasta_ltrharvest_ltrfinder_ltrretriever_lai"
description: |
Performs long terminal retrotransposons (LTR) annotation with gt/ltrharvest, ltrfinder
and ltrretriever along with LTR assembly index (LAI).
keywords:
- genomics
- genome
- annotation
- repeat
- transposons
- retrotransposons
- stats
- qc
components:
- ltrharvest
- ltrfinder
- ltrretriever/ltrretriever
- cat/cat
- ltrretriever/lai
input:
- ch_fasta:
type: file
description: |
Channel for the assembly fasta file
Structure: [ val(meta), path(fasta) ]
pattern: "*.{fsa/fa/fasta}"
- ch_monoploid_seqs:
type: file
description: |
Channel for providing a list of monoploid sequences
for correct estimation of LAI for polyploid genomes.
This parameter is useful when all the haplotypes are
stored in a single fasta file.
Structure: [ val(meta), path(txt) ]
pattern: "*.txt"
- skip_lai:
type: boolean
description: |
Skip LAI estimation
Structure: [ val(boolean) ]
output:
- ltrlib:
type: file
description: |
LTR library constructed by ltrretriever
Structure: [ val(meta), path(fasta) ]
pattern: "*.LTRlib.fa"
- annotation_gff:
type: file
description: |
Annotation based on the LTR library
Structure: [ val(meta), path(gff3) ]
pattern: "*.gff3"
- lai_log:
type: file
description: |
Log from LAI
Structure: [ val(meta), path(log) ]
pattern: "*.LAI.log"
- lai_out:
type: file
description: |
LAI output
Structure: [ val(meta), path(out) ]
pattern: "*.LAI.out"
- versions:
type: file
description: |
File containing software versions
Structure: [ path(versions.yml) ]
pattern: "versions.yml"
authors:
- "@GallVp"
maintainers:
- "@GallVp"
Loading
Loading