diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index baeefe7..6dd8fca 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,8 +2,8 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9004 -Date: 2022-04-21 +Version: 0.9.6.9005 +Date: 2023-05-25 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"), diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index dde39fc..1edba78 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -137,7 +137,8 @@ Froh_inbreeding <- function(runs, mapFile, genome_wide=TRUE){ #' #' @param runs R object (dataframe) with ROH results #' @param mapFile Plink map file (for SNP position) -#' @param Class base ROH-length interval (in Mbps) (default: 0-2, 2-4, 4-8, 8-16, >16) +#' @param Class base ROH-length interval (in Mbps). Will be doubled in each interval, +#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals #' #' #' @return A data frame with individual inbreeding coefficients based on ROH-length of @@ -164,28 +165,10 @@ Froh_inbreeding <- function(runs, mapFile, genome_wide=TRUE){ #' Froh_inbreedingClass <- function(runs, mapFile, Class=2){ - - step_value=Class - range_mb=c(0,0,0,0,0,99999) - for (i in seq(from = 2 , to= length(range_mb)-1, by = 1) ){ - range_mb[i]=step_value - step_value=step_value*2 - } - - #range_mb - name_CLASS=c(paste(range_mb[1],"-",range_mb[2],sep=''), - paste(range_mb[2],"-",range_mb[3],sep=''), - paste(range_mb[3],"-",range_mb[4],sep=''), - paste(range_mb[4],"-",range_mb[5],sep=''), - paste(">",range_mb[5],sep=''), - paste(">",range_mb[6],sep='')) - - # Creating the data frame - runs$MB <- runs$lengthBps/1000000 - runs$CLASS=cut(as.numeric(runs$MB),range_mb) - levels(runs$CLASS) = name_CLASS - runs$CLASS=factor(runs$CLASS) - table(runs$CLASS) + # classify runs in bins + classified_runs <- classifyRuns(runs, class_size = Class) + runs <- classified_runs$runs + range_mb <- classified_runs$range_mb LengthGenome=chromosomeLength(mapFile) @@ -226,7 +209,8 @@ Froh_inbreedingClass <- function(runs, mapFile, Class=2){ #' @param genotypeFile Plink ped file (for SNP position) #' @param mapFile Plink map file (for SNP position) #' @param runs R object (dataframe) with results on detected runs -#' @param Class group of length (in Mbps) by class (default: 0-2, 2-4, 4-8, 8-16, >16) +#' @param Class base ROH-length interval (in Mbps). Will be doubled in each interval, +#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals #' @param snpInRuns TRUE/FALSE (default): should the function \code{snpInsideRuns} be #' called to compute the proportion of times each SNP falls inside a run in the #' group/population? @@ -284,8 +268,6 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ MB <- NULL chrom <- NULL - n_class=Class - result_Froh_genome_wide <- Froh_inbreeding(runs = runs, mapFile = mapFile, genome_wide = TRUE) @@ -294,37 +276,14 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ genome_wide = FALSE) result_Froh_class <- Froh_inbreedingClass(runs = runs, mapFile = mapFile, - Class = n_class) - - - runs$MB <- runs$lengthBps/1000000 - head(runs) - #step_value=2 + Class = Class) - range_mb <- c(0,0,0,0,0,99999) - - for (i in seq(from = 2 , to= length(range_mb)-1, by = 1) ){ - range_mb[i]=n_class - n_class=n_class*2 - } - - #range_mb - name_CLASS=c(paste(range_mb[1],"-",range_mb[2],sep=''), - paste(range_mb[2],"-",range_mb[3],sep=''), - paste(range_mb[3],"-",range_mb[4],sep=''), - paste(range_mb[4],"-",range_mb[5],sep=''), - paste(">",range_mb[5],sep=''), - paste(">",range_mb[6],sep='')) - - message(paste("Class created:" ,name_CLASS[0:5],sep=' ')) - runs$CLASS=cut(as.numeric(runs$MB),range_mb) - levels(runs$CLASS) = name_CLASS - runs$CLASS=factor(runs$CLASS) + # classify runs in bins + runs <- classifyRuns(runs, class_size = Class)$runs #RESULTS!!!!! summary_ROH_mean1 = ddply(runs,.(group,CLASS),summarize,sum=mean(MB)) summary_ROH_mean_class = dcast(summary_ROH_mean1,CLASS ~ group ,value.var = "sum") - levels(summary_ROH_mean_class$CLASS) = name_CLASS[0:5] #RESULTS!!!!! summary_ROH_mean_chr1 = ddply(runs,.(group,chrom),summarize,sum=mean(MB)) @@ -404,7 +363,5 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ message("Calculation % SNP in ROH finish") #FILIPPO } - - return(result_summary) } diff --git a/detectRUNS/R/funktionen.R b/detectRUNS/R/funktionen.R index 1f247b9..75c1e47 100644 --- a/detectRUNS/R/funktionen.R +++ b/detectRUNS/R/funktionen.R @@ -21,29 +21,29 @@ genoConvert <- function(x) { #' Read from a .map file locations and return a data.table object -#' -#' This is an utility function which check for file existance, define +#' +#' This is an utility function which check for file existance, define #' colClasses and then returns the read data.table object #' @param mapFile map file (.map) file path #' @keywords internal #' @return data.table object -#' +#' readMapFile <- function(mapFile) { # define colClasses colClasses <- c("character", "character", "character", "numeric") - + if(file.exists(mapFile)){ # using data.table to read data mappa <- data.table::fread(mapFile, header = F, colClasses = colClasses) } else { stop(paste("file", mapFile, "doesn't exists")) } - + # set column names names(mappa) <- c("CHR","SNP_NAME","x","POSITION") mappa$x <- NULL - + return(mappa) } @@ -751,7 +751,7 @@ consecutiveRuns <- function(indGeno, individual, mapFile, ROHet=TRUE, minSNP=3, #' } #' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") #' newData=readExternalRuns(runsFile, program = 'detectRUNS') -#' +#' readExternalRuns <- function(inputFile=NULL,program=c("plink","BCFtools","detectRUNS")) { @@ -822,3 +822,45 @@ reorderDF <- function(dfx) { return(ordered_dfx) } + + +#' Classify runs in bins. +#' +#' @param runs a ROH dataframe object +#' @param class_size base ROH-length interval (in Mbps). Will be doubled in each interval, +#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals +#' +#' @return a list with runs and range_mb fields: runs keeps a modified version of +#' the original runs dataframe with two additional columns, MB for ROH length in +#' megabases and a CLASS column which tags a ROH in a proper bin relying on size; +#' range_mb field return a list of ranges in MB used to define the classes +#' + +classifyRuns <- function(runs, class_size=2) { + # calculate ROH sizes in MB + runs$MB <- runs$lengthBps/1000000 + + # this is required to classify runs in bins + range_mb <- c(0,0,0,0,0,99999) + + for (i in seq(from = 2 , to = length(range_mb) - 1, by = 1) ) { + range_mb[i] <- class_size + class_size <- class_size * 2 + } + + # using intervals to construct labels + name_CLASS <- c( + paste(range_mb[1], "-", range_mb[2], sep = ''), + paste(range_mb[2], "-", range_mb[3], sep = ''), + paste(range_mb[3], "-", range_mb[4], sep = ''), + paste(range_mb[4], "-", range_mb[5], sep = ''), + paste(">", range_mb[5], sep = '') + ) + + message("Class created: ", paste(name_CLASS[0:5], collapse = ' ')) + runs$CLASS <- cut(as.numeric(runs$MB), range_mb) + levels(runs$CLASS) <- name_CLASS + runs$CLASS <- factor(runs$CLASS) + + return(list("runs" = runs, "range_mb" = range_mb)) +} diff --git a/detectRUNS/R/plots.R b/detectRUNS/R/plots.R index c985038..9c238a7 100644 --- a/detectRUNS/R/plots.R +++ b/detectRUNS/R/plots.R @@ -838,7 +838,8 @@ plot_InbreedingChr<- function(runs, mapFile , groupSplit=TRUE, style=c("ChrBarPl #' @param savePlots should plots be saved out to files or plotted in the graphical terminal (default)? #' @param outputName title prefix (the base name of graph, if savePlots is TRUE)#' #' @param plotTitle title in plot (default NULL) -#' @param Class group of length (in Mbps) by class (default: 0-2, 2-4, 4-8, 8-16, >16) +#' @param Class base ROH-length interval (in Mbps). Will be doubled in each interval, +#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals #' #' @return plot Distribution Runs #' @export @@ -858,7 +859,7 @@ plot_InbreedingChr<- function(runs, mapFile , groupSplit=TRUE, style=c("ChrBarPl #' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") #' runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') #' -#' plot_InbreedingChr(runs = runsDF, mapFile = mapFile, style='All') +#' plot_DistributionRuns(runs = runsDF, mapFile = mapFile, style='All') #' plot_DistributionRuns <- function(runs, mapFile , groupSplit=TRUE, style=c("MeanClass","MeanChr","RunsPCT","RunsPCT_Chr","All") , @@ -896,33 +897,12 @@ plot_DistributionRuns <- function(runs, mapFile , groupSplit=TRUE, style=c("Mean } - step_value=Class - range_mb=c(0,0,0,0,0,99999) - for (i in seq(from = 2 , to= length(range_mb)-1, by = 1) ){ - range_mb[i]=step_value - step_value=step_value*2 - } - - #range_mb - name_CLASS=c(paste(range_mb[1],"-",range_mb[2],sep=''), - paste(range_mb[2],"-",range_mb[3],sep=''), - paste(range_mb[3],"-",range_mb[4],sep=''), - paste(range_mb[4],"-",range_mb[5],sep=''), - paste(">",range_mb[5],sep=''), - paste(">",range_mb[6],sep='')) - - # Creating the data frame - runs$MB <- runs$lengthBps/1000000 - runs$CLASS=cut(as.numeric(runs$MB),range_mb) - levels(runs$CLASS) = name_CLASS - runs$CLASS=factor(runs$CLASS) - - head(runs) + # classify runs in bins + runs <- classifyRuns(runs, class_size = Class)$runs #RESULTS!!!!! summary_ROH_mean1 = ddply(runs,.(group,CLASS),summarize,sum=mean(MB)) summary_ROH_mean_class = dcast(summary_ROH_mean1,CLASS ~ group ,value.var = "sum") - levels(summary_ROH_mean_class$CLASS) = name_CLASS[0:5] #RESULTS!!!!! diff --git a/detectRUNS/man/Froh_inbreedingClass.Rd b/detectRUNS/man/Froh_inbreedingClass.Rd index a57c658..1999430 100644 --- a/detectRUNS/man/Froh_inbreedingClass.Rd +++ b/detectRUNS/man/Froh_inbreedingClass.Rd @@ -11,7 +11,8 @@ Froh_inbreedingClass(runs, mapFile, Class = 2) \item{mapFile}{Plink map file (for SNP position)} -\item{Class}{base ROH-length interval (in Mbps) (default: 0-2, 2-4, 4-8, 8-16, >16)} +\item{Class}{base ROH-length interval (in Mbps). Will be doubled in each interval, +for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals} } \value{ A data frame with individual inbreeding coefficients based on ROH-length of diff --git a/detectRUNS/man/classifyRuns.Rd b/detectRUNS/man/classifyRuns.Rd new file mode 100644 index 0000000..5ca70f6 --- /dev/null +++ b/detectRUNS/man/classifyRuns.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/funktionen.R +\name{classifyRuns} +\alias{classifyRuns} +\title{Classify runs in bins.} +\usage{ +classifyRuns(runs, class_size = 2) +} +\arguments{ +\item{runs}{a ROH dataframe object} + +\item{class_size}{base ROH-length interval (in Mbps). Will be doubled in each interval, +for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals} +} +\value{ +a list with runs and range_mb fields: runs keeps a modified version of +the original runs dataframe with two additional columns, MB for ROH length in +megabases and a CLASS column which tags a ROH in a proper bin relying on size; +range_mb field return a list of ranges in MB used to define the classes +} +\description{ +Classify runs in bins. +} diff --git a/detectRUNS/man/plot_DistributionRuns.Rd b/detectRUNS/man/plot_DistributionRuns.Rd index 5fc678d..eb521de 100644 --- a/detectRUNS/man/plot_DistributionRuns.Rd +++ b/detectRUNS/man/plot_DistributionRuns.Rd @@ -30,7 +30,8 @@ plot_DistributionRuns( \item{plotTitle}{title in plot (default NULL)} -\item{Class}{group of length (in Mbps) by class (default: 0-2, 2-4, 4-8, 8-16, >16)} +\item{Class}{base ROH-length interval (in Mbps). Will be doubled in each interval, +for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals} } \value{ plot Distribution Runs @@ -56,6 +57,6 @@ ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, m runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') -plot_InbreedingChr(runs = runsDF, mapFile = mapFile, style='All') +plot_DistributionRuns(runs = runsDF, mapFile = mapFile, style='All') } diff --git a/detectRUNS/man/readMapFile.Rd b/detectRUNS/man/readMapFile.Rd index 2698a13..a23a9f4 100644 --- a/detectRUNS/man/readMapFile.Rd +++ b/detectRUNS/man/readMapFile.Rd @@ -13,7 +13,7 @@ readMapFile(mapFile) data.table object } \description{ -This is an utility function which check for file existance, define +This is an utility function which check for file existance, define colClasses and then returns the read data.table object } \keyword{internal} diff --git a/detectRUNS/man/summaryRuns.Rd b/detectRUNS/man/summaryRuns.Rd index c09eec7..51c4a5e 100644 --- a/detectRUNS/man/summaryRuns.Rd +++ b/detectRUNS/man/summaryRuns.Rd @@ -13,7 +13,8 @@ summaryRuns(runs, mapFile, genotypeFile, Class = 2, snpInRuns = FALSE) \item{genotypeFile}{Plink ped file (for SNP position)} -\item{Class}{group of length (in Mbps) by class (default: 0-2, 2-4, 4-8, 8-16, >16)} +\item{Class}{base ROH-length interval (in Mbps). Will be doubled in each interval, +for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals} \item{snpInRuns}{TRUE/FALSE (default): should the function \code{snpInsideRuns} be called to compute the proportion of times each SNP falls inside a run in the diff --git a/detectRUNS/tests/testthat/test_functions.R b/detectRUNS/tests/testthat/test_functions.R index 6e066bb..373eed7 100644 --- a/detectRUNS/tests/testthat/test_functions.R +++ b/detectRUNS/tests/testthat/test_functions.R @@ -918,3 +918,63 @@ test_that("Testing snpInsideRuns with CHR as strings", { # testing functions expect_equivalent(test, reference) }) + +testthat::test_that("Testing classify ROH by sizes", { + # loading pre-calculated data + runsFile <- "test.ROHet.sliding.csv" + colClasses <- c(rep("character", 3), rep("numeric", 4)) + + # test first 5 rows + runs <- read.csv2( + runsFile, header = TRUE, stringsAsFactors = FALSE, + colClasses = colClasses, nrows = 5) + + # calling function + results <- classifyRuns(runs, class_size = 2) + test <- results$runs + test_range_mb <- results$range_mb + + # scale length to MB and test + MB <- runs$lengthBps / 1000000 + expect_equivalent(test$MB, MB) + + # testing intervals + range_mb = c(0, 2, 4, 8, 16, 99999) + expect_equivalent(test_range_mb, range_mb) + + # test for classes + classes <- as.factor(c("2-4", "0-2", "4-8", "0-2", "2-4")) + expect_equivalent(test$CLASS, classes) + + # calling function with a different size + results <- classifyRuns(runs, class_size = 1) + test <- results$runs + test_range_mb <- results$range_mb + + # MB are the same + expect_equivalent(test$MB, MB) + + # testing intervals + range_mb = c(0, 1, 2, 4, 8, 99999) + expect_equivalent(test_range_mb, range_mb) + + # test for classes + classes <- as.factor(c("2-4", "1-2", "4-8", "1-2", "2-4")) + expect_equivalent(test$CLASS, classes) + + # calling function with a different size + results <- classifyRuns(runs, class_size = .2) + test <- results$runs + test_range_mb <- results$range_mb + + # MB are the same + expect_equivalent(test$MB, MB) + + # testing intervals + range_mb = c(0, .2, .4, .8, 1.6, 99999) + expect_equivalent(test_range_mb, range_mb) + + # test for classes + classes <- as.factor(c(">1.6", "0.8-1.6", ">1.6", "0.8-1.6", ">1.6")) + expect_equivalent(test$CLASS, classes) +})