Skip to content

Commit

Permalink
Merge pull request #3 from sidora-tools/r_package
Browse files Browse the repository at this point in the history
turn pandora2eager into an R package.
  • Loading branch information
TCLamnidis authored Oct 28, 2021
2 parents adf3f03 + 4c7fbf6 commit 247cb5b
Show file tree
Hide file tree
Showing 13 changed files with 229 additions and 107 deletions.
9 changes: 9 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
^pandora2eager\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
.credentials
pandora2eager.sif
pandora2eager.sh
p2e_singularity.def
helptext.sh
build_singularity_with_docker.sh
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
.credentials
pandora2eager.sif
.Rproj.user
.Rhistory
.Rdata
.httr-oauth
.DS_Store
15 changes: 15 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Package: pandora2eager
Title: Prepare an input TSV file for nf-core/eager with information from Pandora
Version: 0.2.2
Authors@R:
person("Thiseas Christos", "Lamnidis", email = "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-4485-8570"))
Description: Provided with a list of Pandora Sequencing IDs, pandora2eager.R will pull Site, Individual, Library
and Sequencing information from Pandora and create a TSV file with the information needed to run nf-core/eager
on data from these Sequencing IDs.
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.2
Imports:
stringr
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2021
COPYRIGHT HOLDER: THISEAS C. LAMNIDIS
21 changes: 21 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

Copyright (c) 2021 pandora2eager authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Generated by roxygen2: do not edit by hand

export(infer_color_chem)
export(infer_library_specs)
98 changes: 98 additions & 0 deletions R/infer_columns.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#' Infer colour chemistry from sequencer name
#'
#' Looks the sequencer name up in fixed lists of known machines. A message is
#' emitted for MinIon sequencers and for names that are not recognised.
#'
#' @param x character. The sequencer name as it appears in Pandora.
#'
#' @return integer. `4L` or `2L` for the recognised sequencers, and
#'   `NA_integer_` for MinIon sequencers and for unrecognised names.
#' @export
infer_color_chem <- function(x) {
  four_colour <- c(
    "K00233 (HiSeq4000)", "D00829 (HiSeq2500)",
    "M02279 (MiSeq1)", "M06210 (MiSeq2)"
  )
  two_colour <- c("NS500382 (Rosa)", "NS500559 (Tosca)")
  no_colour <- c("MinIon 1", "MinIon 2", "MinIon HKI")

  if (x %in% four_colour) {
    color_chem <- 4L
  } else if (x %in% two_colour) {
    color_chem <- 2L
  } else if (x %in% no_colour) {
    message("MinIon sequencing does not have color chemistry. Set to NA.")
    color_chem <- NA_integer_
  } else {
    message("Color chemistry inference was not successful. Uninferred color chemistries set to 'Unknown'. Contact: [email protected].")
    # The return type is integer, so an unknown chemistry is represented as NA.
    # Assigning NA directly avoids the "NAs introduced by coercion" warning
    # that as.integer("Unknown") used to raise on every unrecognised name.
    color_chem <- NA_integer_
  }
  color_chem
}

#' Infer strandedness and udg_treatment from the library protocol
#'
#' The protocol is split on single spaces and classified from its first word,
#' and for some protocols also its last or third word. A message is emitted
#' whenever one or both values cannot be inferred and are set to `"Unknown"`.
#'
#' @param x character. The library protocol as it appears in Pandora.
#'
#' @return character vector of length two: the strandedness (`"single"`,
#'   `"double"` or `"Unknown"`) followed by the UDG treatment (`"none"`,
#'   `"half"`, `"full"` or `"Unknown"`).
#' @export
infer_library_specs <- function(x) {
  words <- stringr::str_split(x, " ", simplify = TRUE)
  first_word <- words[1, 1]
  last_word <- words[1, ncol(words)]

  # Every comparison uses `%in%` because, unlike `==`, it yields FALSE for NA.
  # A missing protocol therefore falls through to the final branch instead of
  # making `if` fail with "missing value where TRUE/FALSE needed".

  ## ssLib non-UDG
  if (first_word %in% c("ssLibrary", "SsLibrary") && last_word %in% "2018") {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib Unknown UDG
  } else if (first_word %in% "ssLibrary" && last_word %in% "EVA") {
    message("Inference of UDG treatment failed for protocol '",x,"'. Setting to 'Unknown'.
You will need to fill in this information manually, since this protocol could refer to either UDG treatment.
")
    strandedness <- "single"
    udg_treatment <- "Unknown"

  ## ssLib automated non-UDG Leipzig
  } else if (first_word %in% "Automated_ss_library_preparation_noUDG_EVA_CoreUnit") {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib automated half-UDG Leipzig
  } else if (first_word %in% "Automated_ss_library_preparation_partialUDG_EVA_CoreUnit") {
    strandedness <- "single"
    udg_treatment <- "half"

  ## External
  } else if (first_word %in% c("Extern", "External")) {
    strandedness <- "Unknown"
    udg_treatment <- "Unknown"
    message("Cannot infer strandedness and UDG treatment for external libraries. Setting both to \"Unknown\".")

  ## Modern DNA
  } else if (first_word %in% "Illumina") {
    strandedness <- "double"
    udg_treatment <- "none"

  ## dsLib
  } else if (first_word %in% "dsLibrary") {
    strandedness <- "double"
    # Protocols with fewer than three words have no third column; guard the
    # lookup so they do not fail with "subscript out of bounds".
    third_word <- if (ncol(words) >= 3L) words[1, 3] else NA_character_

    ## Non UDG
    if (third_word %in% "UDG") {
      udg_treatment <- "none"

    ## Half UDG
    } else if (third_word %in% "half") {
      udg_treatment <- "half"

    ## Full UDG
    } else if (third_word %in% "full") {
      udg_treatment <- "full"

    ## Unrecognised treatment: keep the result at length two rather than
    ## silently dropping the UDG element.
    } else {
      message("Inference of UDG treatment failed for protocol '", x, "'. Setting to 'Unknown'.")
      udg_treatment <- "Unknown"
    }

  ## Blanks
  } else if (first_word %in% "Capture") {
    udg_treatment <- "none"
    strandedness <- "double"

  ## Inference failed?
  } else {
    message("Inference of strandedness and UDG treatment failed for library protocol '",x,"'. Setting both fields to 'Unknown'. Please fill in this informations manually.
Contact [email protected] if you think the library protocol stated could be automatically inferred.
")
    udg_treatment <- "Unknown"
    strandedness <- "Unknown"
  }
  c(strandedness, udg_treatment)
}
116 changes: 14 additions & 102 deletions pandora2eager.R → exec/pandora2eager.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,95 +8,7 @@ library(dplyr, warn.conflicts = F)
library(readr)
library(tidyr)
library(stringr)

## Infer colour chemistry from sequencer name.
## Returns 4L or 2L for the recognised sequencers, and NA_integer_ (with a
## message) for MinIon sequencers and for unrecognised names.
infer_color_chem <- function(x) {
  four_colour <- c(
    "K00233 (HiSeq4000)", "D00829 (HiSeq2500)",
    "M02279 (MiSeq1)", "M06210 (MiSeq2)"
  )
  two_colour <- c("NS500382 (Rosa)", "NS500559 (Tosca)")
  no_colour <- c("MinIon 1", "MinIon 2", "MinIon HKI")

  if (x %in% four_colour) {
    color_chem <- 4L
  } else if (x %in% two_colour) {
    color_chem <- 2L
  } else if (x %in% no_colour) {
    message("MinIon sequencing does not have color chemistry. Set to NA.")
    color_chem <- NA_integer_
  } else {
    message("Color chemistry inference was not successful. Uninferred color chemistries set to 'Unknown'. Contact: [email protected].")
    ## The result is an integer, so an unknown chemistry is represented as NA.
    ## Assigning NA directly avoids the "NAs introduced by coercion" warning
    ## that as.integer("Unknown") used to raise.
    color_chem <- NA_integer_
  }
  color_chem
}

## Infer strandedness and udg_treatment from the library protocol.
## The protocol is split on single spaces and classified from its first word
## (and, for some protocols, its last or third word). Returns a character
## vector of length two: c(strandedness, udg_treatment). A message is emitted
## whenever a value cannot be inferred and is set to "Unknown".
infer_library_specs <- function(x) {
  words <- str_split(x, " ", simplify = TRUE)
  first_word <- words[1, 1]
  last_word <- words[1, ncol(words)]

  ## Every comparison uses `%in%` because, unlike `==`, it yields FALSE for NA.
  ## A missing protocol therefore falls through to the final branch instead of
  ## making `if` fail with "missing value where TRUE/FALSE needed".

  ## ssLib non-UDG
  if (first_word %in% c("ssLibrary", "SsLibrary") && last_word %in% "2018") {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib Unknown UDG
  } else if (first_word %in% "ssLibrary" && last_word %in% "EVA") {
    message("Inference of UDG treatment failed for protocol '",x,"'. Setting to 'Unknown'.
You will need to fill in this information manually, since this protocol could refer to either UDG treatment.
")
    strandedness <- "single"
    udg_treatment <- "Unknown"

  ## ssLib automated non-UDG Leipzig
  } else if (first_word %in% "Automated_ss_library_preparation_noUDG_EVA_CoreUnit") {
    strandedness <- "single"
    udg_treatment <- "none"

  ## ssLib automated half-UDG Leipzig
  } else if (first_word %in% "Automated_ss_library_preparation_partialUDG_EVA_CoreUnit") {
    strandedness <- "single"
    udg_treatment <- "half"

  ## External
  } else if (first_word %in% c("Extern", "External")) {
    strandedness <- "Unknown"
    udg_treatment <- "Unknown"
    message("Cannot infer strandedness and UDG treatment for external libraries. Setting both to \"Unknown\".")

  ## Modern DNA
  } else if (first_word %in% "Illumina") {
    strandedness <- "double"
    udg_treatment <- "none"

  ## dsLib
  } else if (first_word %in% "dsLibrary") {
    strandedness <- "double"
    ## Protocols with fewer than three words have no third column; guard the
    ## lookup so they do not fail with "subscript out of bounds".
    third_word <- if (ncol(words) >= 3L) words[1, 3] else NA_character_

    ## Non UDG
    if (third_word %in% "UDG") {
      udg_treatment <- "none"

    ## Half UDG
    } else if (third_word %in% "half") {
      udg_treatment <- "half"

    ## Full UDG
    } else if (third_word %in% "full") {
      udg_treatment <- "full"

    ## Unrecognised treatment: keep the result at length two rather than
    ## silently dropping the UDG element.
    } else {
      message("Inference of UDG treatment failed for protocol '", x, "'. Setting to 'Unknown'.")
      udg_treatment <- "Unknown"
    }

  ## Blanks
  } else if (first_word %in% "Capture") {
    udg_treatment <- "none"
    strandedness <- "double"

  ## Inference failed?
  } else {
    message("Inference of strandedness and UDG treatment failed for library protocol '",x,"'. Setting both fields to 'Unknown'. Please fill in this informations manually.
Contact [email protected] if you think the library protocol stated could be automatically inferred.
")
    udg_treatment <- "Unknown"
    strandedness <- "Unknown"
  }
  c(strandedness, udg_treatment)
}
library(pandora2eager)

## Main function that queries pandora, formats info and spits out a table with the necessary information for eager.
collect_and_format_info<- function(query_list_seq, con) {
Expand All @@ -108,12 +20,12 @@ collect_and_format_info<- function(query_list_seq, con) {
)), con = con
)
)

## Get tabs of Organisms, Protocols and Sequencer names
df_list <- get_df_list(
c("TAB_Organism", "TAB_Protocol", "TAB_Sequencing_Sequencer"),con
)

results <- inner_join(complete_pandora_table, query_list_seq, by=c("sequencing.Full_Sequencing_Id"="Sequencing")) %>%
select(library.Full_Library_Id, capture.Full_Capture_Id, sequencing.Sequencer, sequencing.Sequencing_Id, individual.Full_Individual_Id, library.Protocol, individual.Organism, raw_data.FastQ_Files) %>%
## Infer protocol and Organism names from Pandora indexes
Expand All @@ -122,10 +34,10 @@ collect_and_format_info<- function(query_list_seq, con) {
Sequencer=df_list[["TAB_Sequencing_Sequencer"]][["sequencer.Name"]][`sequencing.Sequencer`]) %>%
## Infer SE/PE sequencing from number of FastQs per lane.
mutate(
num_fq=map_int(`raw_data.FastQ_Files`, function(fq) {ncol(str_split(fq, " ", simplify = T))}),
num_fq=map_int(`raw_data.FastQ_Files`, function(fq) {ncol(str_split(fq, " ", simplify = T))}),
num_r1=map(`raw_data.FastQ_Files`, function(fq) {sum(grepl("_R1_",str_split(fq, " ", simplify = T)))}),
SeqType=ifelse(num_fq == num_r1, "SE", "PE")) %>%
select(-starts_with("num_")) %>%
select(-starts_with("num_")) %>%
## Make R1 and R2 columns out of the FastQ file(s)
mutate(`raw_data.FastQ_Files`=map(`raw_data.FastQ_Files`, function(fq) {str_replace_all(fq, " ([[:graph:]]*_R2_.{3}.fastq.gz)", paste0(";","\\1"))})) %>%
separate_rows(`raw_data.FastQ_Files`, sep=" ") %>%
Expand All @@ -134,13 +46,13 @@ collect_and_format_info<- function(query_list_seq, con) {
mutate(
## Eager cannot handle same lane number for same Library_Id. Therefore lane number for additional sequencing needs to be
## artificially inflated (by 8 which is the max lane number in our sequencers). This approach has the advantage that the output
## for a given sequencing ID will be consistent and not dependent on the specific input file passed to this script.
Lane=as.integer(str_replace(`R1`,"[[:graph:]]*_L([[:digit:]]{3})_R[[:graph:]]*", "\\1"))+8*(sequencing.Sequencing_Id-1),
## for a given sequencing ID will be consistent and not dependent on the specific input file passed to this script.
Lane=as.integer(str_replace(`R1`,"[[:graph:]]*_L([[:digit:]]{3})_R[[:graph:]]*", "\\1"))+8*(sequencing.Sequencing_Id-1),
## Library Strandedness and UDG Treatment from protocol name
Strandedness=map_chr(`Protocol`, function (.) {infer_library_specs(.)[1]}),
UDG_Treatment=map_chr(`Protocol`, function(.){infer_library_specs(.)[2]}),
Strandedness=map_chr(`Protocol`, function (.) {pandora2eager::infer_library_specs(.)[1]}),
UDG_Treatment=map_chr(`Protocol`, function(.){pandora2eager::infer_library_specs(.)[2]}),
## Colour Chemistry from sequencer name
Colour_Chemistry=map_int(`Sequencer`, infer_color_chem),
Colour_Chemistry=map_int(`Sequencer`, pandora2eager::infer_color_chem),
## BAM column always set to NA
BAM=NA
) %>%
Expand Down Expand Up @@ -168,15 +80,15 @@ if (!is.na(args[3]) && args[3] == "--debug") {
write_tsv(results, "Debug_table.txt")
} else if (!is.na(args[3]) && ( args[3] == "--rename" || args[3] == "-r")) {
cat(
format_tsv(results %>%
format_tsv(results %>%
mutate(Library_ID=str_replace_all(Library_ID, "[.]", "_")) %>% ## Replace dots in the Library_ID to underscores.
select(Sample_Name, Library_ID, Lane, Colour_Chemistry,
select(Sample_Name, Library_ID, Lane, Colour_Chemistry,
SeqType, Organism, Strandedness, UDG_Treatment, R1, R2, BAM))
)
} else {
cat(
format_tsv(results %>%
select(Sample_Name, Library_ID, Lane, Colour_Chemistry,
format_tsv(results %>%
select(Sample_Name, Library_ID, Lane, Colour_Chemistry,
SeqType, Organism, Strandedness, UDG_Treatment, R1, R2, BAM))
)
}
Expand Down
17 changes: 17 additions & 0 deletions man/infer_color_chem.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/infer_library_specs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions p2e_singularity.def
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ From: rocker/tidyverse:4.1.1

%files
.credentials
pandora2eager.R
helptext.sh

%post
chmod 644 /.credentials
apt-get update && apt-get install -y libmariadb-dev
R --slave -e 'install.packages(c("remotes", "DBI", "RMariaDB"))'
R --slave -e 'remotes::install_github("sidora-tools/sidora.core", dependencies = TRUE)'
R --slave -e 'remotes::install_github("sidora-tools/pandora2eager", dependencies = TRUE)'

%runscript
#!/bin/bash
Expand All @@ -20,12 +20,12 @@ From: rocker/tidyverse:4.1.1
bash /helptext.sh
elif [[ ${#@} -gt 1 ]]; then
shift
/pandora2eager.R ${input} /.credentials $*
/usr/local/lib/R/site-library/pandora2eager/exec/pandora2eager.R ${input} /.credentials $*
else
/pandora2eager.R ${input} /.credentials
/usr/local/lib/R/site-library/pandora2eager/exec/pandora2eager.R ${input} /.credentials
fi

%labels
Author Thiseas C. Lamnidis
GithubUrl https://github.com/sidora-tools/pandora2eager.git
Version 0.2.1-beta
Version 0.2.2
Loading

0 comments on commit 247cb5b

Please sign in to comment.