Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 plot_Distribution report wrong labels with classes based mean graphs #42

Open
wants to merge 4 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions detectRUNS/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Package: detectRUNS
Type: Package
Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
Diploid Genomes
Version: 0.9.6.9004
Date: 2022-04-21
Version: 0.9.6.9005
Date: 2023-05-25
Authors@R: c(
person("Filippo","Biscarini", email="[email protected]", role=c("aut", "cre")),
person("Paolo","Cozzi", email="[email protected]", role="aut"),
Expand Down
65 changes: 11 additions & 54 deletions detectRUNS/R/Stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ Froh_inbreeding <- function(runs, mapFile, genome_wide=TRUE){
#'
#' @param runs R object (dataframe) with ROH results
#' @param mapFile Plink map file (for SNP position)
#' @param Class base ROH-length interval (in Mbps) (default: 0-2, 2-4, 4-8, 8-16, >16)
#' @param Class base ROH-length interval (in Mbps). Will be doubled in each interval,
#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals
#'
#'
#' @return A data frame with individual inbreeding coefficients based on ROH-length of
Expand All @@ -164,28 +165,10 @@ Froh_inbreeding <- function(runs, mapFile, genome_wide=TRUE){
#'

Froh_inbreedingClass <- function(runs, mapFile, Class=2){

step_value=Class
range_mb=c(0,0,0,0,0,99999)
for (i in seq(from = 2 , to= length(range_mb)-1, by = 1) ){
range_mb[i]=step_value
step_value=step_value*2
}

#range_mb
name_CLASS=c(paste(range_mb[1],"-",range_mb[2],sep=''),
paste(range_mb[2],"-",range_mb[3],sep=''),
paste(range_mb[3],"-",range_mb[4],sep=''),
paste(range_mb[4],"-",range_mb[5],sep=''),
paste(">",range_mb[5],sep=''),
paste(">",range_mb[6],sep=''))

# Creating the data frame
runs$MB <- runs$lengthBps/1000000
runs$CLASS=cut(as.numeric(runs$MB),range_mb)
levels(runs$CLASS) = name_CLASS
runs$CLASS=factor(runs$CLASS)
table(runs$CLASS)
# classify runs in bins
classified_runs <- classifyRuns(runs, class_size = Class)
runs <- classified_runs$runs
range_mb <- classified_runs$range_mb

LengthGenome=chromosomeLength(mapFile)

Expand Down Expand Up @@ -226,7 +209,8 @@ Froh_inbreedingClass <- function(runs, mapFile, Class=2){
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param Class group of length (in Mbps) by class (default: 0-2, 2-4, 4-8, 8-16, >16)
#' @param Class base ROH-length interval (in Mbps). Will be doubled in each interval,
#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals
#' @param snpInRuns TRUE/FALSE (default): should the function \code{snpInsideRuns} be
#' called to compute the proportion of times each SNP falls inside a run in the
#' group/population?
Expand Down Expand Up @@ -284,8 +268,6 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
MB <- NULL
chrom <- NULL

n_class=Class

result_Froh_genome_wide <- Froh_inbreeding(runs = runs,
mapFile = mapFile,
genome_wide = TRUE)
Expand All @@ -294,37 +276,14 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
genome_wide = FALSE)
result_Froh_class <- Froh_inbreedingClass(runs = runs,
mapFile = mapFile,
Class = n_class)


runs$MB <- runs$lengthBps/1000000
head(runs)
#step_value=2
Class = Class)

range_mb <- c(0,0,0,0,0,99999)

for (i in seq(from = 2 , to= length(range_mb)-1, by = 1) ){
range_mb[i]=n_class
n_class=n_class*2
}

#range_mb
name_CLASS=c(paste(range_mb[1],"-",range_mb[2],sep=''),
paste(range_mb[2],"-",range_mb[3],sep=''),
paste(range_mb[3],"-",range_mb[4],sep=''),
paste(range_mb[4],"-",range_mb[5],sep=''),
paste(">",range_mb[5],sep=''),
paste(">",range_mb[6],sep=''))

message(paste("Class created:" ,name_CLASS[0:5],sep=' '))
runs$CLASS=cut(as.numeric(runs$MB),range_mb)
levels(runs$CLASS) = name_CLASS
runs$CLASS=factor(runs$CLASS)
# classify runs in bins
runs <- classifyRuns(runs, class_size = Class)$runs

#RESULTS!!!!!
summary_ROH_mean1 = ddply(runs,.(group,CLASS),summarize,sum=mean(MB))
summary_ROH_mean_class = dcast(summary_ROH_mean1,CLASS ~ group ,value.var = "sum")
levels(summary_ROH_mean_class$CLASS) = name_CLASS[0:5]

#RESULTS!!!!!
summary_ROH_mean_chr1 = ddply(runs,.(group,chrom),summarize,sum=mean(MB))
Expand Down Expand Up @@ -404,7 +363,5 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
message("Calculation % SNP in ROH finish") #FILIPPO
}



return(result_summary)
}
56 changes: 49 additions & 7 deletions detectRUNS/R/funktionen.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,29 @@ genoConvert <- function(x) {


#' Read from a .map file locations and return a data.table object
#'
#' This is an utility function which check for file existance, define
#'
#' This is an utility function which check for file existance, define
#' colClasses and then returns the read data.table object
#' @param mapFile map file (.map) file path
#' @keywords internal
#' @return data.table object
#'
#'

readMapFile <- function(mapFile) {
# define colClasses
colClasses <- c("character", "character", "character", "numeric")

if(file.exists(mapFile)){
# using data.table to read data
mappa <- data.table::fread(mapFile, header = F, colClasses = colClasses)
} else {
stop(paste("file", mapFile, "doesn't exists"))
}

# set column names
names(mappa) <- c("CHR","SNP_NAME","x","POSITION")
mappa$x <- NULL

return(mappa)
}

Expand Down Expand Up @@ -751,7 +751,7 @@ consecutiveRuns <- function(indGeno, individual, mapFile, ROHet=TRUE, minSNP=3,
#' }
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
#' newData=readExternalRuns(runsFile, program = 'detectRUNS')
#'
#'

readExternalRuns <- function(inputFile=NULL,program=c("plink","BCFtools","detectRUNS")) {

Expand Down Expand Up @@ -822,3 +822,45 @@ reorderDF <- function(dfx) {

return(ordered_dfx)
}


#' Classify runs in bins.
#'
#' @param runs a ROH dataframe object
#' @param class_size base ROH-length interval (in Mbps). Will be doubled in each interval,
#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals
#'
#' @return a list with runs and range_mb fields: runs keeps a modified version of
#' the original runs dataframe with two additional columns, MB for ROH length in
#' megabases and a CLASS column which tags a ROH in a proper bin relying on size;
#' range_mb field return a list of ranges in MB used to define the classes
#'

classifyRuns <- function(runs, class_size=2) {
# calculate ROH sizes in MB
runs$MB <- runs$lengthBps/1000000

# this is required to classify runs in bins
range_mb <- c(0,0,0,0,0,99999)

for (i in seq(from = 2 , to = length(range_mb) - 1, by = 1) ) {
range_mb[i] <- class_size
class_size <- class_size * 2
}

# using intervals to construct labels
name_CLASS <- c(
paste(range_mb[1], "-", range_mb[2], sep = ''),
paste(range_mb[2], "-", range_mb[3], sep = ''),
paste(range_mb[3], "-", range_mb[4], sep = ''),
paste(range_mb[4], "-", range_mb[5], sep = ''),
paste(">", range_mb[5], sep = '')
)

message("Class created: ", paste(name_CLASS[0:5], collapse = ' '))
runs$CLASS <- cut(as.numeric(runs$MB), range_mb)
levels(runs$CLASS) <- name_CLASS
runs$CLASS <- factor(runs$CLASS)

return(list("runs" = runs, "range_mb" = range_mb))
}
30 changes: 5 additions & 25 deletions detectRUNS/R/plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,8 @@ plot_InbreedingChr<- function(runs, mapFile , groupSplit=TRUE, style=c("ChrBarPl
#' @param savePlots should plots be saved out to files or plotted in the graphical terminal (default)?
#' @param outputName title prefix (the base name of graph, if savePlots is TRUE)#'
#' @param plotTitle title in plot (default NULL)
#' @param Class group of length (in Mbps) by class (default: 0-2, 2-4, 4-8, 8-16, >16)
#' @param Class base ROH-length interval (in Mbps). Will be doubled in each interval,
#' for example the default value 2 create 0-2, 2-4, 4-8, 8-16 and >16 intervals
#'
#' @return plot Distribution Runs
#' @export
Expand All @@ -858,7 +859,7 @@ plot_InbreedingChr<- function(runs, mapFile , groupSplit=TRUE, style=c("ChrBarPl
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
#' runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
#'
#' plot_InbreedingChr(runs = runsDF, mapFile = mapFile, style='All')
#' plot_DistributionRuns(runs = runsDF, mapFile = mapFile, style='All')
#'

plot_DistributionRuns <- function(runs, mapFile , groupSplit=TRUE, style=c("MeanClass","MeanChr","RunsPCT","RunsPCT_Chr","All") ,
Expand Down Expand Up @@ -896,33 +897,12 @@ plot_DistributionRuns <- function(runs, mapFile , groupSplit=TRUE, style=c("Mean
}


step_value=Class
range_mb=c(0,0,0,0,0,99999)
for (i in seq(from = 2 , to= length(range_mb)-1, by = 1) ){
range_mb[i]=step_value
step_value=step_value*2
}

#range_mb
name_CLASS=c(paste(range_mb[1],"-",range_mb[2],sep=''),
paste(range_mb[2],"-",range_mb[3],sep=''),
paste(range_mb[3],"-",range_mb[4],sep=''),
paste(range_mb[4],"-",range_mb[5],sep=''),
paste(">",range_mb[5],sep=''),
paste(">",range_mb[6],sep=''))

# Creating the data frame
runs$MB <- runs$lengthBps/1000000
runs$CLASS=cut(as.numeric(runs$MB),range_mb)
levels(runs$CLASS) = name_CLASS
runs$CLASS=factor(runs$CLASS)

head(runs)
# classify runs in bins
runs <- classifyRuns(runs, class_size = Class)$runs

#RESULTS!!!!!
summary_ROH_mean1 = ddply(runs,.(group,CLASS),summarize,sum=mean(MB))
summary_ROH_mean_class = dcast(summary_ROH_mean1,CLASS ~ group ,value.var = "sum")
levels(summary_ROH_mean_class$CLASS) = name_CLASS[0:5]


#RESULTS!!!!!
Expand Down
3 changes: 2 additions & 1 deletion detectRUNS/man/Froh_inbreedingClass.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions detectRUNS/man/classifyRuns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions detectRUNS/man/plot_DistributionRuns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion detectRUNS/man/readMapFile.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion detectRUNS/man/summaryRuns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading