-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from sidora-tools/r_package
turn pandora2eager into an R package.
- Loading branch information
Showing
13 changed files
with
229 additions
and
107 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
^pandora2eager\.Rproj$ | ||
^\.Rproj\.user$ | ||
^LICENSE\.md$ | ||
.credentials | ||
pandora2eager.sif | ||
pandora2eager.sh | ||
p2e_singularity.def | ||
helptext.sh | ||
build_singularity_with_docker.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,7 @@ | ||
.credentials | ||
pandora2eager.sif | ||
.Rproj.user | ||
.Rhistory | ||
.Rdata | ||
.httr-oauth | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
Package: pandora2eager | ||
Title: Prepare an input TSV file for nf-core/eager with information from Pandora | ||
Version: 0.2.2 | ||
Authors@R: | ||
person("Thiseas Christos", "Lamnidis", email = "[email protected]", role = c("aut", "cre"), | ||
comment = c(ORCID = "0000-0003-4485-8570")) | ||
Description: Provided with a list of Pandora Sequencing IDs, pandora2eager.R will pull Site, Individual, Library | ||
and Sequencing information from Pandora and create a TSV file with the information needed to run nf-core/eager | ||
on data from these Sequencing IDs. | ||
License: MIT + file LICENSE | ||
Encoding: UTF-8 | ||
Roxygen: list(markdown = TRUE) | ||
RoxygenNote: 7.1.2 | ||
Imports: | ||
stringr |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
YEAR: 2021 | ||
COPYRIGHT HOLDER: THISEAS C. LAMNIDIS |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# MIT License | ||
|
||
Copyright (c) 2021 pandora2eager authors | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(infer_color_chem) | ||
export(infer_library_specs) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#' Infer colour chemistry from sequencer name
#'
#' Maps a sequencer name to the number of colour channels of its chemistry by
#' looking the name up in fixed lists of known sequencers.
#'
#' @param x character. A single sequencer name as it appears in Pandora.
#'
#' @return integer. `4L` or `2L` for the known sequencers, `NA_integer_` for
#'   MinIon sequencers and for any sequencer name that is not recognised. A
#'   message is emitted in both `NA` cases.
#' @export
infer_color_chem <- function(x) {
  four_colour <- c(
    "K00233 (HiSeq4000)", "D00829 (HiSeq2500)",
    "M02279 (MiSeq1)", "M06210 (MiSeq2)"
  )
  two_colour <- c("NS500382 (Rosa)", "NS500559 (Tosca)")
  minion <- c("MinIon 1", "MinIon 2", "MinIon HKI")

  if (x %in% four_colour) {
    color_chem <- 4L
  } else if (x %in% two_colour) {
    color_chem <- 2L
  } else if (x %in% minion) {
    message("MinIon sequencing does not have color chemistry. Set to NA.")
    color_chem <- NA_integer_
  } else {
    ## The return type is integer, so an uninferred chemistry can only be
    ## represented as NA. Returning NA_integer_ directly avoids the
    ## "NAs introduced by coercion" warning that as.integer("Unknown") raised.
    message(
      "Color chemistry inference was not successful. ",
      "Uninferred color chemistries set to NA. Contact: [email protected]."
    )
    color_chem <- NA_integer_
  }
  color_chem
}
|
||
#' Infer strandedness and udg_treatment from protocol number
#'
#' Splits the protocol name on single spaces and decides strandedness and UDG
#' treatment from the first word, the last word and, for `dsLibrary`
#' protocols, the third word. Whenever a value cannot be inferred it is set to
#' `"Unknown"` and a message is emitted.
#'
#' @param x character. A single library protocol as it appears in Pandora.
#'
#' @return character vector of length 2: `c(strandedness, udg_treatment)`.
#'   Strandedness is one of `"single"`, `"double"`, `"Unknown"`; UDG treatment
#'   is one of `"none"`, `"half"`, `"full"`, `"Unknown"`.
#' @export
infer_library_specs <- function(x) {
  words <- stringr::str_split(x, " ", simplify = TRUE)
  first_word <- words[1, 1]
  last_word <- words[1, ncol(words)]

  ## Comparisons below use identical()/%in% so that an NA token never reaches
  ## `if` as a missing condition.

  ## ssLib non-UDG
  if (first_word %in% c("ssLibrary", "SsLibrary") &&
    identical(last_word, "2018")) {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib Unknown UDG
  } else if (identical(first_word, "ssLibrary") &&
    identical(last_word, "EVA")) {
    message(
      "Inference of UDG treatment failed for protocol '", x,
      "'. Setting to 'Unknown'.\n",
      "You will need to fill in this information manually, since this ",
      "protocol could refer to either UDG treatment.\n"
    )
    strandedness <- "single"
    udg_treatment <- "Unknown"

  ## ssLib automated non-UDG Leipzig
  } else if (identical(
    first_word, "Automated_ss_library_preparation_noUDG_EVA_CoreUnit"
  )) {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib automated half-UDG Leipzig
  } else if (identical(
    first_word, "Automated_ss_library_preparation_partialUDG_EVA_CoreUnit"
  )) {
    strandedness <- "single"
    udg_treatment <- "half"

  ## External
  } else if (first_word %in% c("Extern", "External")) {
    strandedness <- "Unknown"
    udg_treatment <- "Unknown"
    message("Cannot infer strandedness and UDG treatment for external libraries. Setting both to \"Unknown\".")

  ## Modern DNA
  } else if (identical(first_word, "Illumina")) {
    strandedness <- "double"
    udg_treatment <- "none"

  ## dsLib: the third word of the protocol name encodes the UDG treatment
  } else if (identical(first_word, "dsLibrary")) {
    strandedness <- "double"
    udg_by_third_word <- c(UDG = "none", half = "half", full = "full")
    ## Guard against protocol names with fewer than three words, which would
    ## otherwise fail with "subscript out of bounds".
    third_word <- if (ncol(words) >= 3L) words[1, 3] else NA_character_
    if (third_word %in% names(udg_by_third_word)) {
      udg_treatment <- udg_by_third_word[[third_word]]
    } else {
      ## Keep the result at length 2 instead of silently dropping the UDG
      ## field when the third word is missing or not recognised.
      message(
        "Inference of UDG treatment failed for protocol '", x,
        "'. Setting to 'Unknown'."
      )
      udg_treatment <- "Unknown"
    }

  ## Blanks
  } else if (identical(first_word, "Capture")) {
    udg_treatment <- "none"
    strandedness <- "double"

  ## Inference failed?
  } else {
    message(
      "Inference of strandedness and UDG treatment failed for library protocol '",
      x,
      "'. Setting both fields to 'Unknown'. Please fill in this informations manually.\n",
      "Contact [email protected] if you think the library protocol stated could be automatically inferred.\n"
    )
    udg_treatment <- "Unknown"
    strandedness <- "Unknown"
  }
  c(strandedness, udg_treatment)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,95 +8,7 @@ library(dplyr, warn.conflicts = F) | |
library(readr) | ||
library(tidyr) | ||
library(stringr) | ||
|
||
## Infer colour chemistry from sequencer name.
## Takes a single sequencer name (as it appears in Pandora) and returns an
## integer: 4L or 2L for the known sequencers, NA_integer_ for MinIon
## sequencers and for any name that is not recognised (with a message).
infer_color_chem <- function(x) {
  four_colour <- c(
    "K00233 (HiSeq4000)", "D00829 (HiSeq2500)",
    "M02279 (MiSeq1)", "M06210 (MiSeq2)"
  )
  two_colour <- c("NS500382 (Rosa)", "NS500559 (Tosca)")
  minion <- c("MinIon 1", "MinIon 2", "MinIon HKI")

  if (x %in% four_colour) {
    color_chem <- 4L
  } else if (x %in% two_colour) {
    color_chem <- 2L
  } else if (x %in% minion) {
    message("MinIon sequencing does not have color chemistry. Set to NA.")
    color_chem <- NA_integer_
  } else {
    ## The return type is integer, so an uninferred chemistry can only be
    ## represented as NA. Returning NA_integer_ directly avoids the
    ## "NAs introduced by coercion" warning that as.integer("Unknown") raised.
    message(
      "Color chemistry inference was not successful. ",
      "Uninferred color chemistries set to NA. Contact: [email protected]."
    )
    color_chem <- NA_integer_
  }
  color_chem
}
|
||
## Infer strandedness and udg_treatment from protocol number.
## Takes a single library protocol name (as it appears in Pandora), splits it
## on single spaces and returns c(strandedness, udg_treatment). Any value that
## cannot be inferred is set to "Unknown" and a message is emitted.
infer_library_specs <- function(x) {
  words <- stringr::str_split(x, " ", simplify = TRUE)
  first_word <- words[1, 1]
  last_word <- words[1, ncol(words)]

  ## Comparisons below use identical()/%in% so that an NA token never reaches
  ## `if` as a missing condition.

  ## ssLib non-UDG
  if (first_word %in% c("ssLibrary", "SsLibrary") &&
    identical(last_word, "2018")) {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib Unknown UDG
  } else if (identical(first_word, "ssLibrary") &&
    identical(last_word, "EVA")) {
    message(
      "Inference of UDG treatment failed for protocol '", x,
      "'. Setting to 'Unknown'.\n",
      "You will need to fill in this information manually, since this ",
      "protocol could refer to either UDG treatment.\n"
    )
    strandedness <- "single"
    udg_treatment <- "Unknown"

  ## ssLib automated non-UDG Leipzig
  } else if (identical(
    first_word, "Automated_ss_library_preparation_noUDG_EVA_CoreUnit"
  )) {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib automated half-UDG Leipzig
  } else if (identical(
    first_word, "Automated_ss_library_preparation_partialUDG_EVA_CoreUnit"
  )) {
    strandedness <- "single"
    udg_treatment <- "half"

  ## External
  } else if (first_word %in% c("Extern", "External")) {
    strandedness <- "Unknown"
    udg_treatment <- "Unknown"
    message("Cannot infer strandedness and UDG treatment for external libraries. Setting both to \"Unknown\".")

  ## Modern DNA
  } else if (identical(first_word, "Illumina")) {
    strandedness <- "double"
    udg_treatment <- "none"

  ## dsLib: the third word of the protocol name encodes the UDG treatment
  } else if (identical(first_word, "dsLibrary")) {
    strandedness <- "double"
    udg_by_third_word <- c(UDG = "none", half = "half", full = "full")
    ## Guard against protocol names with fewer than three words, which would
    ## otherwise fail with "subscript out of bounds".
    third_word <- if (ncol(words) >= 3L) words[1, 3] else NA_character_
    if (third_word %in% names(udg_by_third_word)) {
      udg_treatment <- udg_by_third_word[[third_word]]
    } else {
      ## Keep the result at length 2 instead of silently dropping the UDG
      ## field when the third word is missing or not recognised.
      message(
        "Inference of UDG treatment failed for protocol '", x,
        "'. Setting to 'Unknown'."
      )
      udg_treatment <- "Unknown"
    }

  ## Blanks
  } else if (identical(first_word, "Capture")) {
    udg_treatment <- "none"
    strandedness <- "double"

  ## Inference failed?
  } else {
    message(
      "Inference of strandedness and UDG treatment failed for library protocol '",
      x,
      "'. Setting both fields to 'Unknown'. Please fill in this informations manually.\n",
      "Contact [email protected] if you think the library protocol stated could be automatically inferred.\n"
    )
    udg_treatment <- "Unknown"
    strandedness <- "Unknown"
  }
  c(strandedness, udg_treatment)
}
library(pandora2eager) | ||
|
||
## Main function that queries pandora, formats info and spits out a table with the necessary information for eager. | ||
collect_and_format_info<- function(query_list_seq, con) { | ||
|
@@ -108,12 +20,12 @@ collect_and_format_info<- function(query_list_seq, con) { | |
)), con = con | ||
) | ||
) | ||
|
||
## Get tabs of Organisms, Protocols and Sequencer names | ||
df_list <- get_df_list( | ||
c("TAB_Organism", "TAB_Protocol", "TAB_Sequencing_Sequencer"),con | ||
) | ||
|
||
results <- inner_join(complete_pandora_table, query_list_seq, by=c("sequencing.Full_Sequencing_Id"="Sequencing")) %>% | ||
select(library.Full_Library_Id, capture.Full_Capture_Id, sequencing.Sequencer, sequencing.Sequencing_Id, individual.Full_Individual_Id, library.Protocol, individual.Organism, raw_data.FastQ_Files) %>% | ||
## Infer protocol and Organism names from Pandora indexes | ||
|
@@ -122,10 +34,10 @@ collect_and_format_info<- function(query_list_seq, con) { | |
Sequencer=df_list[["TAB_Sequencing_Sequencer"]][["sequencer.Name"]][`sequencing.Sequencer`]) %>% | ||
## Infer SE/PE sequencing from number of FastQs per lane. | ||
mutate( | ||
num_fq=map_int(`raw_data.FastQ_Files`, function(fq) {ncol(str_split(fq, " ", simplify = T))}), | ||
num_fq=map_int(`raw_data.FastQ_Files`, function(fq) {ncol(str_split(fq, " ", simplify = T))}), | ||
num_r1=map(`raw_data.FastQ_Files`, function(fq) {sum(grepl("_R1_",str_split(fq, " ", simplify = T)))}), | ||
SeqType=ifelse(num_fq == num_r1, "SE", "PE")) %>% | ||
select(-starts_with("num_")) %>% | ||
select(-starts_with("num_")) %>% | ||
## Make R1 and R2 columns out of the FastQ file(s) | ||
mutate(`raw_data.FastQ_Files`=map(`raw_data.FastQ_Files`, function(fq) {str_replace_all(fq, " ([[:graph:]]*_R2_.{3}.fastq.gz)", paste0(";","\\1"))})) %>% | ||
separate_rows(`raw_data.FastQ_Files`, sep=" ") %>% | ||
|
@@ -134,13 +46,13 @@ collect_and_format_info<- function(query_list_seq, con) { | |
mutate( | ||
## Eager cannot handle same lane number for same Library_Id. Therefore lane number for additional sequencing needs to be | ||
## artificially inflated (by 8 which is the max lane number in our sequencers). This approach has the advantage that the output | ||
## for a given sequencing ID will be consistent and not dependent on the specific input file passed to this script. | ||
Lane=as.integer(str_replace(`R1`,"[[:graph:]]*_L([[:digit:]]{3})_R[[:graph:]]*", "\\1"))+8*(sequencing.Sequencing_Id-1), | ||
## for a given sequencing ID will be consistent and not dependent on the specific input file passed to this script. | ||
Lane=as.integer(str_replace(`R1`,"[[:graph:]]*_L([[:digit:]]{3})_R[[:graph:]]*", "\\1"))+8*(sequencing.Sequencing_Id-1), | ||
## Library Strandedness and UDG Treatment from protocol name | ||
Strandedness=map_chr(`Protocol`, function (.) {infer_library_specs(.)[1]}), | ||
UDG_Treatment=map_chr(`Protocol`, function(.){infer_library_specs(.)[2]}), | ||
Strandedness=map_chr(`Protocol`, function (.) {pandora2eager::infer_library_specs(.)[1]}), | ||
UDG_Treatment=map_chr(`Protocol`, function(.){pandora2eager::infer_library_specs(.)[2]}), | ||
## Colour Chemistry from sequencer name | ||
Colour_Chemistry=map_int(`Sequencer`, infer_color_chem), | ||
Colour_Chemistry=map_int(`Sequencer`, pandora2eager::infer_color_chem), | ||
## BAM column always set to NA | ||
BAM=NA | ||
) %>% | ||
|
@@ -168,15 +80,15 @@ if (!is.na(args[3]) && args[3] == "--debug") { | |
write_tsv(results, "Debug_table.txt") | ||
} else if (!is.na(args[3]) && ( args[3] == "--rename" || args[3] == "-r")) { | ||
cat( | ||
format_tsv(results %>% | ||
format_tsv(results %>% | ||
mutate(Library_ID=str_replace_all(Library_ID, "[.]", "_")) %>% ## Replace dots in the Library_ID to underscores. | ||
select(Sample_Name, Library_ID, Lane, Colour_Chemistry, | ||
select(Sample_Name, Library_ID, Lane, Colour_Chemistry, | ||
SeqType, Organism, Strandedness, UDG_Treatment, R1, R2, BAM)) | ||
) | ||
} else { | ||
cat( | ||
format_tsv(results %>% | ||
select(Sample_Name, Library_ID, Lane, Colour_Chemistry, | ||
format_tsv(results %>% | ||
select(Sample_Name, Library_ID, Lane, Colour_Chemistry, | ||
SeqType, Organism, Strandedness, UDG_Treatment, R1, R2, BAM)) | ||
) | ||
} | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.