From 7c6bd2d10f36e284c6cfecb53a08cbe4cadf42f5 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 28 Mar 2022 19:06:35 +0200 Subject: [PATCH 01/23] :white_check_mark: test tableRuns --- detectRUNS/DESCRIPTION | 6 +++--- detectRUNS/man/tableRuns.Rd | 5 +++-- detectRUNS/src/RcppExports.cpp | 5 +++++ detectRUNS/tests/testthat/test.tableRuns.csv | 4 ++++ detectRUNS/tests/testthat/test_Stats.R | 21 ++++++++++++++++++++ 5 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 detectRUNS/tests/testthat/test.tableRuns.csv create mode 100644 detectRUNS/tests/testthat/test_Stats.R diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index a4e7595..56220c4 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,11 +2,11 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9000 +Version: 0.9.6.9001 Date: 2020-12-20 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), - person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"), + person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"), person("Giustino","Gaspa", email="gigaspa@uniss.it", role="aut"), person("Gabriele","Marras", email="gmarras@uoguelph.ca", role="aut") ) @@ -28,7 +28,7 @@ Imports: Rcpp, gridExtra, data.table -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 Suggests: testthat, knitr, diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd index 83c2da8..1e2bf7d 100644 --- a/detectRUNS/man/tableRuns.Rd +++ b/detectRUNS/man/tableRuns.Rd @@ -27,8 +27,9 @@ proportion of individuals carrying that run (e.g. 70\%)} } \value{ A dataframe with the most common runs detected in the sampled individuals -(the group/population, start and end position of the run, chromosome and number of SNP -included in the run are reported in the output dataframe) +(the group/population, start and end position of the run, chromosome, number of SNP +included in the run and average percentage of SNPs in run +are reported in the output dataframe) } \description{ This function takes in input either the run results or the output from diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index c2b5579..e4721e6 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // fast_factor SEXP fast_factor(SEXP x); RcppExport SEXP _detectRUNS_fast_factor(SEXP xSEXP) { diff --git a/detectRUNS/tests/testthat/test.tableRuns.csv b/detectRUNS/tests/testthat/test.tableRuns.csv new file mode 100644 index 0000000..fa6d084 --- /dev/null +++ b/detectRUNS/tests/testthat/test.tableRuns.csv @@ -0,0 +1,4 @@ +"Group";"Start_SNP";"End_SNP";"chrom";"nSNP";"from";"to";"avg_pct" +"Jacobs";"s07830.1";"s20481.1";"24";9;16041579;16811910;60 +"Jacobs";"s15456.1";"s30694.1";"24";12;31774063;32349389;60 +"Jacobs";"s59571.1";"s13600.1";"24";6;33056732;33397677;60 diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R new file mode 100644 index 0000000..7a9dcc0 --- /dev/null +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -0,0 +1,21 @@ +library(testthat) +library(detectRUNS) +context("Testing Stats") + +# get file paths +genotypeFile <- "test.ped" +mapFile <- "test.map" +rawFile <- "test.raw" +runsFile <- "test.ROHet.consecutive.csv" + +test_that("Test tableRuns", { + colClasses <- c("factor", "character", "character", "character", "numeric", + "integer", "integer", "numeric") + reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) + levels(reference$Group) <- c("Jacobs", "Navajo-Churro") + + runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') + test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) + + expect_equal(reference, test) +}) From 0362a333cfa42f7dcb058cafbbf605bad6bd320b Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 28 Mar 2022 19:21:44 +0200 Subject: [PATCH 02/23] :fire: remove SnpInRuns from tableRuns parameters snpInsideRuns(Cpp) is now internal --- detectRUNS/R/Stats.R | 79 +++++++++++++---------------- detectRUNS/man/tableRuns.Rd | 18 ++----- performance/test-snpInsideRunsCpp.R | 26 ++++++++++ 3 files changed, 64 insertions(+), 59 deletions(-) create mode 100644 performance/test-snpInsideRunsCpp.R diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index c1d7cf3..f076675 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -397,10 +397,9 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ #' Function to retrieve most common runs in the population #' -#' This function takes in input either the run results or the output from -#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run) -#' in the population/group, and returns a subset of the runs most commonly -#' found in the group/population. The parameter \code{threshold} controls the definition +#' This function takes in input either the run results and returns a subset of +#' the runs most commonly found in the group/population. The parameter +#' \code{threshold} controls the definition #' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) #' #' @param genotypeFile Plink ped file (for SNP position) @@ -408,8 +407,6 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ #' @param runs R object (dataframe) with results on detected runs #' @param threshold value from 0 to 1 (default 0.7) that controls the desired #' proportion of individuals carrying that run (e.g. 70\%) -#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a -#' run in the population (output from \code{snpInsideRuns}) #' #' @return A dataframe with the most common runs detected in the sampled individuals #' (the group/population, start and end position of the run, chromosome, number of SNP @@ -435,7 +432,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ #' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) #' -tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) { +tableRuns <- function(runs=NULL, genotypeFile, mapFile, threshold = 0.5) { #set a threshold threshold_used=threshold*100 @@ -444,45 +441,37 @@ tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold # read map file mappa <- readMapFile(mapFile) - if(!is.null(runs) & is.null(SnpInRuns)){ - message('I found only Runs data frame. GOOD!') - - #change colnames in runs file - names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") - - #Start calculation % SNP in ROH - message("Calculation % SNP in ROH") #FILIPPO - all_SNPinROH <- data.frame("SNP_NAME"=character(), - "CHR"=integer(), - "POSITION"=numeric(), - "COUNT"=integer(), - "BREED"=factor(), - "PERCENTAGE"=numeric(), - stringsAsFactors=FALSE) - - # create progress bar - total <- length(unique(runs$CHROMOSOME)) - message(paste('Chromosome founds: ',total)) #FILIPPO - n=0 - pb <- txtProgressBar(min = 0, max = total, style = 3) - - #SNP in ROH - for (chrom in sort(unique(runs$CHROMOSOME))) { - runsChrom <- runs[runs$CHROMOSOME==chrom,] - mapKrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) - all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) - n=n+1 - setTxtProgressBar(pb, n) - } - close(pb) - message("Calculation % SNP in ROH finish") #FILIPPO - } else if (is.null(runs) & !is.null(SnpInRuns)) { - message('I found only SNPinRuns data frame. GOOD!') - all_SNPinROH=SnpInRuns - } else{ - stop('You gave me Runs and SNPinRuns! Please choose one!') + #change colnames in runs file + names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") + + #Start calculation % SNP in ROH + message("Calculation % SNP in ROH") #FILIPPO + all_SNPinROH <- data.frame("SNP_NAME"=character(), + "CHR"=integer(), + "POSITION"=numeric(), + "COUNT"=integer(), + "BREED"=factor(), + "PERCENTAGE"=numeric(), + stringsAsFactors=FALSE) + + # create progress bar + total <- length(unique(runs$CHROMOSOME)) + message(paste('Chromosome founds: ',total)) #FILIPPO + n=0 + pb <- txtProgressBar(min = 0, max = total, style = 3) + + #SNP in ROH + for (chrom in sort(unique(runs$CHROMOSOME))) { + runsChrom <- runs[runs$CHROMOSOME==chrom,] + mapKrom <- mappa[mappa$CHR==chrom,] + snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) + all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) + n=n+1 + setTxtProgressBar(pb, n) } + close(pb) + + message("Calculation % SNP in ROH finish") #FILIPPO #consecutive number all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE)) diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd index 1e2bf7d..28dfec8 100644 --- a/detectRUNS/man/tableRuns.Rd +++ b/detectRUNS/man/tableRuns.Rd @@ -4,20 +4,11 @@ \alias{tableRuns} \title{Function to retrieve most common runs in the population} \usage{ -tableRuns( - runs = NULL, - SnpInRuns = NULL, - genotypeFile, - mapFile, - threshold = 0.5 -) +tableRuns(runs = NULL, genotypeFile, mapFile, threshold = 0.5) } \arguments{ \item{runs}{R object (dataframe) with results on detected runs} -\item{SnpInRuns}{dataframe with the proportion of times each SNP falls inside a -run in the population (output from \code{snpInsideRuns})} - \item{genotypeFile}{Plink ped file (for SNP position)} \item{mapFile}{Plink map file (for SNP position)} @@ -32,10 +23,9 @@ included in the run and average percentage of SNPs in run are reported in the output dataframe) } \description{ -This function takes in input either the run results or the output from -the function \code{snpInsideRuns} (proportion of times a SNP is inside a run) -in the population/group, and returns a subset of the runs most commonly -found in the group/population. The parameter \code{threshold} controls the definition +This function takes in input either the run results and returns a subset of +the runs most commonly found in the group/population. The parameter +\code{threshold} controls the definition of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) } \examples{ diff --git a/performance/test-snpInsideRunsCpp.R b/performance/test-snpInsideRunsCpp.R new file mode 100644 index 0000000..a59ec19 --- /dev/null +++ b/performance/test-snpInsideRunsCpp.R @@ -0,0 +1,26 @@ + +# get files path +mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package="detectRUNS") +genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS") +runsfile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") + +# read mapfile with custom methods +mappa <- readMapFile(mapFile) + +# this is the chromosome I want to test +chrom <- "24" + +# subsetting mapChrom (get x random snps from mapfile) +set.seed(42) +mapChrom <- mappa[mappa$CHR==chrom, ] +mapChrom <- mapChrom[sort(sample(nrow(mapChrom), 10)), ] + +# loading pre-calculated data +runs <- readExternalRuns(runsfile, program="detectRUNS") +names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") +runsChrom <- runs[runs$CHROMOSOME==chrom, ] + +# get snps inside runs +for (i in 1:100000) { + test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) +} From 9c74576f46ac05fad13e786ad419b36c8c547711 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 28 Mar 2022 20:06:30 +0200 Subject: [PATCH 03/23] :zap: calculate tableRuns by chrom --- detectRUNS/R/Stats.R | 193 ++++++++++++++++++------------------ detectRUNS/man/tableRuns.Rd | 14 +-- 2 files changed, 103 insertions(+), 104 deletions(-) diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index f076675..7c82365 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -397,8 +397,8 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ #' Function to retrieve most common runs in the population #' -#' This function takes in input either the run results and returns a subset of -#' the runs most commonly found in the group/population. The parameter +#' This function takes in input either the run results and returns a subset of +#' the runs most commonly found in the group/population. The parameter #' \code{threshold} controls the definition #' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) #' @@ -422,129 +422,126 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ #' # calculating runs of Homozygosity #' \dontrun{ #' # skipping runs calculation -#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15, -#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000) +#' runs <- slidingRUNS.run(genotypeFile, mapFile, +#' windowSize = 15, threshold = 0.1, minSNP = 15, +#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +#' ) #' } #' # loading pre-calculated data -#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") -#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS') +#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") #' #' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) #' +tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { -tableRuns <- function(runs=NULL, genotypeFile, mapFile, threshold = 0.5) { - - #set a threshold - threshold_used=threshold*100 - message(paste('Threshold used:',threshold_used)) + # set a threshold + threshold_used <- threshold * 100 + message(paste("Threshold used:", threshold_used)) # read map file mappa <- readMapFile(mapFile) - #change colnames in runs file - names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") + # change colnames in runs file + names(runs) <- c("POPULATION", "IND", "CHROMOSOME", "COUNT", "START", "END", "LENGTH") - #Start calculation % SNP in ROH - message("Calculation % SNP in ROH") #FILIPPO - all_SNPinROH <- data.frame("SNP_NAME"=character(), - "CHR"=integer(), - "POSITION"=numeric(), - "COUNT"=integer(), - "BREED"=factor(), - "PERCENTAGE"=numeric(), - stringsAsFactors=FALSE) + # vector of breeds + group_list <- as.vector(unique(runs$POPULATION)) + + # final data frame + final_table <- data.frame( + "GROUP" = character(0), "Start_SNP" = character(0), "End_SNP" = character(0), + "chrom" = character(0), "nSNP" = integer(0), "from" = integer(0), "to" = integer(0), "avg_pct" = numeric(0) + ) + + # Start calculation % SNP in ROH + message("Calculation % SNP in ROH") # FILIPPO # create progress bar total <- length(unique(runs$CHROMOSOME)) - message(paste('Chromosome founds: ',total)) #FILIPPO - n=0 + message(paste("Chromosome founds: ", total)) # FILIPPO + n <- 0 pb <- txtProgressBar(min = 0, max = total, style = 3) - #SNP in ROH + # SNP in ROH for (chrom in sort(unique(runs$CHROMOSOME))) { - runsChrom <- runs[runs$CHROMOSOME==chrom,] - mapKrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) - all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) - n=n+1 - setTxtProgressBar(pb, n) - } - close(pb) - - message("Calculation % SNP in ROH finish") #FILIPPO - - #consecutive number - all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE)) - - #final data frame - final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0), - "chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0), "avg_pct"=numeric(0)) - - - #vector of breeds - group_list=as.vector(unique(all_SNPinROH$BREED)) + # extract from map and runs only the chromosome I need + runsChrom <- runs[runs$CHROMOSOME == chrom, ] + mapKrom <- mappa[mappa$CHR == chrom, ] - for (grp in group_list){ - message(paste('checking: ',grp)) + # calculate snpInsideRuns + snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile) - #create subset for group/thresold - group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,]) + # add a column with the row names as integer + snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns)) - #print(group_subset) + for (grp in group_list) { + # create subset for group/thresold + group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE > threshold_used, ] - #variable - old_pos=group_subset[1,7] - snp_pos1=group_subset[1,3] - Start_SNP=group_subset[1,1] - snp_count=0 - sum_pct = 0 - - x=2 - while(x <= length(rownames(group_subset))) { - - snp_count = snp_count + 1 - new_pos=group_subset[x,7] - old_pos=group_subset[x-1,7] - chr_old=group_subset[x-1,2] - chr_new =group_subset[x,2] - sum_pct = sum_pct + group_subset[x-1,"PERCENTAGE"] - - diff=new_pos-old_pos + # after filtering, I need to have at least 2 rows or I can't do the following stuff + if (nrow(group_subset) <= 2) { + break + } - if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) { - if (x==length(rownames(group_subset))){ - end_SNP=group_subset[x,1] - TO=group_subset[x,3] - }else{ - end_SNP=group_subset[x-1,1] - TO=group_subset[x-1,3] + # initialize stuff + old_pos <- group_subset[1, 7] + snp_pos1 <- group_subset[1, 3] + Start_SNP <- group_subset[1, 1] + snp_count <- 0 + sum_pct <- 0 + + for (x in 2:nrow(group_subset)) { + snp_count <- snp_count + 1 + new_pos <- group_subset[x, 7] + old_pos <- group_subset[x - 1, 7] + chr_old <- group_subset[x - 1, 2] + chr_new <- group_subset[x, 2] + sum_pct <- sum_pct + group_subset[x - 1, "PERCENTAGE"] + + # this will be 1 in case of adjacent rows + diff <- new_pos - old_pos + + if ((diff > 1) | (chr_new != chr_old) | x == nrow(group_subset)) { + if (x == nrow(group_subset)) { + end_SNP <- group_subset[x, 1] + TO <- group_subset[x, 3] + } else { + end_SNP <- group_subset[x - 1, 1] + TO <- group_subset[x - 1, 3] + } + + final_table <- rbind.data.frame(final_table, final_table = data.frame( + "Group" = group_subset[x - 1, 5], + "Start_SNP" = Start_SNP, + "End_SNP" = end_SNP, + "chrom" = group_subset[x - 1, 2], + "nSNP" = snp_count, + "from" = snp_pos1, + "to" = TO, + "avg_pct" = sum_pct + )) + + # reset variable + snp_count <- 0 + sum_pct <- 0 + snp_pos1 <- group_subset[x, 3] + Start_SNP <- group_subset[x, 1] } - - final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5], - "Start_SNP"=Start_SNP, - "End_SNP"=end_SNP, - "chrom"=group_subset[x-1,2], - "nSNP"=snp_count, - "from"=snp_pos1, - "to"=TO, - "avg_pct"=sum_pct)) - - #reset variable - snp_count=0 - sum_pct=0 - snp_pos1=group_subset[x,3] - Start_SNP=group_subset[x,1] } - - #upgrade x value - x <- x+1 - } + + n <- n + 1 + setTxtProgressBar(pb, n) } - final_table$avg_pct = final_table$avg_pct/final_table$nSNP + close(pb) + + message("Calculation % SNP in ROH finish") # FILIPPO + + final_table$avg_pct <- final_table$avg_pct / final_table$nSNP + + row.names(final_table) <- seq(1, nrow(final_table)) - rownames(final_table) <- seq(1,length(row.names(final_table))) return(final_table) } - diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd index 28dfec8..39e34e1 100644 --- a/detectRUNS/man/tableRuns.Rd +++ b/detectRUNS/man/tableRuns.Rd @@ -23,8 +23,8 @@ included in the run and average percentage of SNPs in run are reported in the output dataframe) } \description{ -This function takes in input either the run results and returns a subset of -the runs most commonly found in the group/population. The parameter +This function takes in input either the run results and returns a subset of +the runs most commonly found in the group/population. The parameter \code{threshold} controls the definition of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) } @@ -36,12 +36,14 @@ mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detec # calculating runs of Homozygosity \dontrun{ # skipping runs calculation -runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15, -ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000) +runs <- slidingRUNS.run(genotypeFile, mapFile, + windowSize = 15, threshold = 0.1, minSNP = 15, + ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +) } # loading pre-calculated data -runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") -runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS') +runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) From 99cccaae24d72243ae181cf7c8b7ddb6403f61be Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 29 Mar 2022 11:06:57 +0200 Subject: [PATCH 04/23] :construction_worker: attempt to fix coverage visualization --- .github/workflows/test-coverage.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 4727d86..8b0479d 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -29,6 +29,5 @@ jobs: working-directory: ./detectRUNS - name: Test coverage - run: covr::codecov() + run: covr::codecov(path="detectRUNS/") shell: Rscript {0} - working-directory: ./detectRUNS From 0a7a24b3031c255287f5edf9254d95770413d227 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 29 Mar 2022 11:43:07 +0200 Subject: [PATCH 05/23] :bug: change tableRUNS threshold check runs where SNP are major or equal the threshold specified --- detectRUNS/R/Stats.R | 10 ++++------ detectRUNS/tests/testthat/test.tableRuns.csv | 7 ++++--- detectRUNS/tests/testthat/test_Stats.R | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 7c82365..70bfcc1 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -17,7 +17,7 @@ chromosomeLength <- function(mapFile){ # read mapfile mappa <- readMapFile(mapFile) - + maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0 # defining NULL variables to avoid warning messages @@ -355,7 +355,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ message("Calculating SNPs inside ROH") names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") - + #read map file mappa <- readMapFile(mapFile) @@ -477,7 +477,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { for (grp in group_list) { # create subset for group/thresold - group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE > threshold_used, ] + group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE >= threshold_used, ] # after filtering, I need to have at least 2 rows or I can't do the following stuff if (nrow(group_subset) <= 2) { @@ -495,14 +495,12 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { snp_count <- snp_count + 1 new_pos <- group_subset[x, 7] old_pos <- group_subset[x - 1, 7] - chr_old <- group_subset[x - 1, 2] - chr_new <- group_subset[x, 2] sum_pct <- sum_pct + group_subset[x - 1, "PERCENTAGE"] # this will be 1 in case of adjacent rows diff <- new_pos - old_pos - if ((diff > 1) | (chr_new != chr_old) | x == nrow(group_subset)) { + if ((diff > 1) | x == nrow(group_subset)) { if (x == nrow(group_subset)) { end_SNP <- group_subset[x, 1] TO <- group_subset[x, 3] diff --git a/detectRUNS/tests/testthat/test.tableRuns.csv b/detectRUNS/tests/testthat/test.tableRuns.csv index fa6d084..ed2d731 100644 --- a/detectRUNS/tests/testthat/test.tableRuns.csv +++ b/detectRUNS/tests/testthat/test.tableRuns.csv @@ -1,4 +1,5 @@ "Group";"Start_SNP";"End_SNP";"chrom";"nSNP";"from";"to";"avg_pct" -"Jacobs";"s07830.1";"s20481.1";"24";9;16041579;16811910;60 -"Jacobs";"s15456.1";"s30694.1";"24";12;31774063;32349389;60 -"Jacobs";"s59571.1";"s13600.1";"24";6;33056732;33397677;60 +"Jacobs";"s07830.1";"s46918.1";"24";17;16041579;17307132;55,2941176470588 +"Jacobs";"s52452.1";"OAR24_35715684.1";"24";19;31556086;32598869;56,3157894736842 +"Jacobs";"s37249.1";"OAR24_36594681.1";"24";15;32681863;33503397;54,6666666666667 +"Jacobs";"s62096.1";"OAR24_37414016.1";"24";8;33798780;34225118;50 diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index 7a9dcc0..eb47a79 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -13,9 +13,9 @@ test_that("Test tableRuns", { "integer", "integer", "numeric") reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) levels(reference$Group) <- c("Jacobs", "Navajo-Churro") - + runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) - + expect_equal(reference, test) }) From a6e37d45ceb59eef77abf34dcc54360af8e3ec22 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Wed, 30 Mar 2022 14:45:22 +0200 Subject: [PATCH 06/23] :bug: fix bug in tableRuns rewrite code to calculate percentages correctly --- detectRUNS/R/Stats.R | 65 ++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 70bfcc1..ab539c3 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -484,48 +484,67 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { break } + # message("Start from: ", paste0(group_subset[1, ], sep = " ")) + # initialize stuff - old_pos <- group_subset[1, 7] - snp_pos1 <- group_subset[1, 3] + old_idx <- group_subset[1, 7] + from <- group_subset[1, 3] Start_SNP <- group_subset[1, 1] - snp_count <- 0 - sum_pct <- 0 + snp_count <- 1 + sum_pct <- group_subset[1, "PERCENTAGE"] for (x in 2:nrow(group_subset)) { - snp_count <- snp_count + 1 - new_pos <- group_subset[x, 7] - old_pos <- group_subset[x - 1, 7] - sum_pct <- sum_pct + group_subset[x - 1, "PERCENTAGE"] + # message("Processing: ", paste0(group_subset[x, ], sep = " ")) + + # get current index + new_idx <- group_subset[x, 7] - # this will be 1 in case of adjacent rows - diff <- new_pos - old_pos + # Difference between indexes. This will be 1 in case of adjacent rows + diff <- new_idx - old_idx if ((diff > 1) | x == nrow(group_subset)) { if (x == nrow(group_subset)) { + # message("End of subset") end_SNP <- group_subset[x, 1] TO <- group_subset[x, 3] } else { + # message("End of segment") end_SNP <- group_subset[x - 1, 1] TO <- group_subset[x - 1, 3] } - final_table <- rbind.data.frame(final_table, final_table = data.frame( - "Group" = group_subset[x - 1, 5], - "Start_SNP" = Start_SNP, - "End_SNP" = end_SNP, - "chrom" = group_subset[x - 1, 2], - "nSNP" = snp_count, - "from" = snp_pos1, - "to" = TO, - "avg_pct" = sum_pct - )) + # throw away row if composed by only one SNP + if (snp_count > 1) { + final_table <- rbind.data.frame(final_table, final_table = data.frame( + "Group" = group_subset[x - 1, 5], + "Start_SNP" = Start_SNP, + "End_SNP" = end_SNP, + "chrom" = group_subset[x - 1, 2], + "nSNP" = snp_count, + "from" = from, + "to" = TO, + "avg_pct" = sum_pct + )) + + # message("Writing: ", paste0(tail(final_table, n = 1), sep = " ")) + # message("Start a new segment: ", paste0(group_subset[x, ], sep = " ")) + } # reset variable - snp_count <- 0 - sum_pct <- 0 - snp_pos1 <- group_subset[x, 3] + snp_count <- 1 + sum_pct <- group_subset[x, "PERCENTAGE"] + from <- group_subset[x, 3] Start_SNP <- group_subset[x, 1] + } else { + # update stuff + snp_count <- snp_count + 1 + sum_pct <- sum_pct + group_subset[x, "PERCENTAGE"] + + # message("Add snp to group: ", paste(snp_count, new_idx, old_idx, sum_pct)) } + + # update index + old_idx <- new_idx } } From 437619ed3d0cfdbafb0af8a80b6160138ea70ded Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Wed, 30 Mar 2022 22:31:09 +0200 Subject: [PATCH 07/23] :bug: fix bug in tableRuns filtering correctly skip empty subset --- detectRUNS/R/Stats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index ab539c3..8ae2235 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -481,7 +481,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { # after filtering, I need to have at least 2 rows or I can't do the following stuff if (nrow(group_subset) <= 2) { - break + next } # message("Start from: ", paste0(group_subset[1, ], sep = " ")) From ca3d31e36a0bf04adab06242bc6f74544d6a4bfb Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 4 Apr 2022 12:04:19 +0200 Subject: [PATCH 08/23] :sparkles: start implementing tableRuns with Rcpp --- detectRUNS/DESCRIPTION | 4 +- detectRUNS/NAMESPACE | 1 + detectRUNS/R/RcppExports.R | 47 ++++++++++ detectRUNS/man/tableRunsCpp.Rd | 52 +++++++++++ detectRUNS/src/RcppExports.cpp | 15 +++ detectRUNS/src/functions.cpp | 166 +++++++++++++++++++++++++++++++++ 6 files changed, 283 insertions(+), 2 deletions(-) create mode 100644 detectRUNS/man/tableRunsCpp.Rd diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index 56220c4..e4c980d 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,8 +2,8 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9001 -Date: 2020-12-20 +Version: 0.9.6.9002 +Date: 2022-04-04 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"), diff --git a/detectRUNS/NAMESPACE b/detectRUNS/NAMESPACE index 183f836..11039e4 100644 --- a/detectRUNS/NAMESPACE +++ b/detectRUNS/NAMESPACE @@ -16,6 +16,7 @@ export(reorderDF) export(slidingRUNS.run) export(summaryRuns) export(tableRuns) +export(tableRunsCpp) import(ggplot2) import(itertools) import(plyr) diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index b67a235..946d400 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -196,3 +196,50 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) } +#' Function to retrieve most common runs in the population +#' +#' This function takes in input either the run results and returns a subset of +#' the runs most commonly found in the group/population. The parameter +#' \code{threshold} controls the definition +#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) +#' +#' @param genotypeFile Plink ped file (for SNP position) +#' @param mapFile Plink map file (for SNP position) +#' @param runs R object (dataframe) with results on detected runs +#' @param threshold value from 0 to 1 (default 0.7) that controls the desired +#' proportion of individuals carrying that run (e.g. 70\%) +#' +#' @return A dataframe with the most common runs detected in the sampled individuals +#' (the group/population, start and end position of the run, chromosome, number of SNP +#' included in the run and average percentage of SNPs in run +#' are reported in the output dataframe) +#' @export +#' +#' @examples +#' # getting map and ped paths +#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") +#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") +#' +#' # calculating runs of Homozygosity +#' \dontrun{ +#' # skipping runs calculation +#' runs <- slidingRUNS.run(genotypeFile, mapFile, +#' windowSize = 15, threshold = 0.1, minSNP = 15, +#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +#' ) +#' } +#' # loading pre-calculated data +#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") +#' +#' table <- tableRunsCpp( +#' runs = runsDF, genotypeFile = genotypeFile, +#' mapFile = mapFile, threshold = 0.5) +#' +#' @useDynLib detectRUNS +#' @importFrom Rcpp sourceCpp +#' +tableRunsCpp <- function(runs, genotypeFile, mapFile, threshold = 0.5) { + .Call('_detectRUNS_tableRunsCpp', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold) +} + diff --git a/detectRUNS/man/tableRunsCpp.Rd b/detectRUNS/man/tableRunsCpp.Rd new file mode 100644 index 0000000..7672a3a --- /dev/null +++ b/detectRUNS/man/tableRunsCpp.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{tableRunsCpp} +\alias{tableRunsCpp} +\title{Function to retrieve most common runs in the population} +\usage{ +tableRunsCpp(runs, genotypeFile, mapFile, threshold = 0.5) +} +\arguments{ +\item{runs}{R object (dataframe) with results on detected runs} + +\item{genotypeFile}{Plink ped file (for SNP position)} + +\item{mapFile}{Plink map file (for SNP position)} + +\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired +proportion of individuals carrying that run (e.g. 70\%)} +} +\value{ +A dataframe with the most common runs detected in the sampled individuals +(the group/population, start and end position of the run, chromosome, number of SNP +included in the run and average percentage of SNPs in run +are reported in the output dataframe) +} +\description{ +This function takes in input either the run results and returns a subset of +the runs most commonly found in the group/population. The parameter +\code{threshold} controls the definition +of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) +} +\examples{ +# getting map and ped paths +genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") +mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") + +# calculating runs of Homozygosity +\dontrun{ +# skipping runs calculation +runs <- slidingRUNS.run(genotypeFile, mapFile, + windowSize = 15, threshold = 0.1, minSNP = 15, + ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +) +} +# loading pre-calculated data +runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") + +table <- tableRunsCpp( + runs = runsDF, genotypeFile = genotypeFile, + mapFile = mapFile, threshold = 0.5) + +} diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index e4721e6..862949d 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -159,6 +159,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// tableRunsCpp +DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold); +RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< DataFrame >::type runs(runsSEXP); + Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); + Rcpp::traits::input_parameter< std::string >::type mapFile(mapFileSEXP); + Rcpp::traits::input_parameter< const float >::type threshold(thresholdSEXP); + rcpp_result_gen = Rcpp::wrap(tableRunsCpp(runs, genotypeFile, mapFile, threshold)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_fast_factor", (DL_FUNC) &_detectRUNS_fast_factor, 1}, @@ -172,6 +186,7 @@ static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1}, {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9}, {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3}, + {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4}, {NULL, NULL, 0} }; diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 89b04ab..7279cde 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1013,3 +1013,169 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, // returning all runs for this individual genotype return(res); } + + +DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){ + CharacterVector population = runs[0]; + CharacterVector ind = runs[1]; + CharacterVector chromosomes = runs[2]; + IntegerVector count = runs[3]; + IntegerVector start = runs[4]; + IntegerVector end = runs[5]; + IntegerVector length = runs[6]; + + LogicalVector chrom_idx(chromosomes.size()); + + for (unsigned int j=0; j= threshold); + } + + return DataFrame::create( + Named("SNP_NAME") = snps[idx], + Named("CHR") = chromosomes[idx], + Named("POSITION") = positions[idx], + Named("COUNT") = counts[idx], + Named("BREED") = fast_factor(breeds[idx]), //returns a factor + Named("PERCENTAGE") = percentages[idx], + Named("Number") = numbers[idx], + _["stringsAsFactors"] = false); +} + + +//' Function to retrieve most common runs in the population +//' +//' This function takes in input either the run results and returns a subset of +//' the runs most commonly found in the group/population. The parameter +//' \code{threshold} controls the definition +//' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) +//' +//' @param genotypeFile Plink ped file (for SNP position) +//' @param mapFile Plink map file (for SNP position) +//' @param runs R object (dataframe) with results on detected runs +//' @param threshold value from 0 to 1 (default 0.7) that controls the desired +//' proportion of individuals carrying that run (e.g. 70\%) +//' +//' @return A dataframe with the most common runs detected in the sampled individuals +//' (the group/population, start and end position of the run, chromosome, number of SNP +//' included in the run and average percentage of SNPs in run +//' are reported in the output dataframe) +//' @export +//' +//' @examples +//' # getting map and ped paths +//' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") +//' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") +//' +//' # calculating runs of Homozygosity +//' \dontrun{ +//' # skipping runs calculation +//' runs <- slidingRUNS.run(genotypeFile, mapFile, +//' windowSize = 15, threshold = 0.1, minSNP = 15, +//' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +//' ) +//' } +//' # loading pre-calculated data +//' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +//' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") +//' +//' table <- tableRunsCpp( +//' runs = runsDF, genotypeFile = genotypeFile, +//' mapFile = mapFile, threshold = 0.5) +//' +//' @useDynLib detectRUNS +//' @importFrom Rcpp sourceCpp +//' +// [[Rcpp::export]] +DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, + const float threshold = 0.5) { + + // set a threshold + float threshold_used = threshold * 100; + Rprintf("Threshold used: %f", threshold_used); + + // Obtaining namespace of detectRUNS package + Environment pkg = Environment::namespace_env("detectRUNS"); + + // Picking up readMapFile function from detectRUNS package + Function readMapFile = pkg["readMapFile"]; + + // read mapfile using R function (which uses data.table::fread) + DataFrame mappa = readMapFile(mapFile); + + // get distinct chromosomes + CharacterVector chromosomes = runs[2]; + CharacterVector unique_chromosomes = sort_unique(chromosomes); + + DataFrame snpInsideRuns; + + for (unsigned int i=0; i(unique_chromosomes[i]); + + // extract the desidered chrom + DataFrame runsChrom = subset_runs_by_chrom(runs, chrom); + DataFrame mapKrom = subset_map_by_chrom(mappa, chrom); + + // calculate snpInsideRuns + snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile); + + // add a column with the row names as integer + IntegerVector Number = seq(1, snpInsideRuns.nrows()); + snpInsideRuns.push_back(Number, "Number"); + + // now filter snpInsideRuns relying on threshold + snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); + + // debug + break; + } + + return snpInsideRuns; + +} From 7db05eb32358f2e820151f69c2b4f97add7b7004 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 4 Apr 2022 16:45:58 +0200 Subject: [PATCH 09/23] :sparkles: implement tableRuns in cpp --- detectRUNS/R/Stats.R | 4 +- detectRUNS/src/functions.cpp | 199 ++++++++++++++++++++++++++++++++--- 2 files changed, 185 insertions(+), 18 deletions(-) diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 8ae2235..cdbf55e 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -480,7 +480,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE >= threshold_used, ] # after filtering, I need to have at least 2 rows or I can't do the following stuff - if (nrow(group_subset) <= 2) { + if (nrow(group_subset) < 2) { next } @@ -527,10 +527,10 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { )) # message("Writing: ", paste0(tail(final_table, n = 1), sep = " ")) - # message("Start a new segment: ", paste0(group_subset[x, ], sep = " ")) } # reset variable + # message("Start a new segment: ", paste0(group_subset[x, ], sep = " ")) snp_count <- 1 sum_pct <- group_subset[x, "PERCENTAGE"] from <- group_subset[x, 3] diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 7279cde..fad9804 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1026,11 +1026,13 @@ DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){ LogicalVector chrom_idx(chromosomes.size()); + Rprintf("Got %d runs\n", runs.nrows()); + for (unsigned int j=0; j(chromosomes[j]) == chrom); } - return DataFrame::create( + DataFrame res = DataFrame::create( Named("CHR") = chromosomes[chrom_idx], Named("SNP_NAME") = snps[chrom_idx], Named("POSITION") = positions[chrom_idx] ); + + Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str()); + return res; } @@ -1073,11 +1083,13 @@ DataFrame filter_snpInsideRuns_by_threshold( LogicalVector idx(percentages.size()); + Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows()); + for (unsigned int j=0; j= threshold); } - return DataFrame::create( + DataFrame res = DataFrame::create( Named("SNP_NAME") = snps[idx], Named("CHR") = chromosomes[idx], Named("POSITION") = positions[idx], @@ -1086,9 +1098,39 @@ DataFrame filter_snpInsideRuns_by_threshold( Named("PERCENTAGE") = percentages[idx], Named("Number") = numbers[idx], _["stringsAsFactors"] = false); + + Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold); + return res; } +DataFrame filter_snpInsideRuns_by_breed( + DataFrame snpInsideRuns, std::string breed){ + CharacterVector snps = snpInsideRuns["SNP_NAME"]; + CharacterVector chromosomes = snpInsideRuns["CHR"]; + IntegerVector positions = snpInsideRuns["POSITION"]; + IntegerVector counts = snpInsideRuns["COUNT"]; + CharacterVector breeds = snpInsideRuns["BREED"]; + NumericVector percentages = snpInsideRuns["PERCENTAGE"]; + IntegerVector numbers = snpInsideRuns["Number"]; + + LogicalVector idx(breeds.size()); + + for (unsigned int j=0; j(unique_chromosomes[i]); - // extract the desidered chrom + Rprintf("Processing %d/%d chromosome\n", i+1, unique_chromosomes.size()); + + // extract the desired chrom DataFrame runsChrom = subset_runs_by_chrom(runs, chrom); DataFrame mapKrom = subset_map_by_chrom(mappa, chrom); // calculate snpInsideRuns - snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile); + DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile); // add a column with the row names as integer IntegerVector Number = seq(1, snpInsideRuns.nrows()); @@ -1172,10 +1227,122 @@ DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string map // now filter snpInsideRuns relying on threshold snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); - // debug - break; + // collect the distinct breeds + CharacterVector breeds = snpInsideRuns["BREED"]; + CharacterVector group_list = unique(breeds); + + for (unsigned int j=0; j(group_list[j]); + + Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str()); + + DataFrame group_subset = filter_snpInsideRuns_by_breed(snpInsideRuns, grp); + + // after filtering, I need to have at least 2 rows or I can't do the following stuff + if (group_subset.nrows() < 2) { + continue; + } + + // initialize stuff + IntegerVector idxs = group_subset["Number"]; + IntegerVector positions = group_subset["POSITION"]; + CharacterVector snp_names = group_subset["SNP_NAME"]; + NumericVector sum_pcts = group_subset["PERCENTAGE"]; + + int old_idx = idxs[0]; + int from = positions[0]; + String Start_SNP = snp_names[0]; + int snp_count = 1; + double sum_pct = sum_pcts[0]; + + Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from); + Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count); + Rprintf("sum_pct: %f\n", sum_pct); + + for (unsigned int x=1; x(snp_names[x]).c_str()); + Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); + + // get current index + int new_idx = idxs[x]; + + // Difference between indexes. This will be 1 in case of adjacent rows + int diff = new_idx - old_idx; + + // declare other variables + String end_SNP; + int TO; + + if ((diff > 1) | (x == group_subset.nrows()-1)) { + if (x == group_subset.nrows()-1) { + Rcout << "End of subset\n"; + end_SNP = snp_names[x]; + TO = positions[x]; + } else { + Rcout << "End of segment\n"; + end_SNP = snp_names[x-1]; + TO = positions[x-1]; + } + + // throw away row if composed by only one SNP + if (snp_count > 1) { + res_group.push_back(grp); + res_start_snp.push_back(Start_SNP); + res_end_snp.push_back(end_SNP); + res_chrom.push_back(chrom); + res_n_snp.push_back(snp_count); + res_from.push_back(from); + res_to.push_back(TO); + res_avg_pct.push_back(sum_pct); + + Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring()); + Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str()); + Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO); + Rprintf("sum_pct: %d\n", sum_pct); + } + + // reset variable + Rprintf("Start a new segment: %s, ", as(snp_names[x]).c_str()); + Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); + + snp_count = 1; + sum_pct = sum_pcts[x]; + from = positions[x]; + Start_SNP = snp_names[x]; + } else { + // update stuff + snp_count++; + sum_pct += sum_pcts[x]; + + Rprintf("Add %s to group ", as(snp_names[x]).c_str()); + Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct); + } + + // update index + old_idx = new_idx; + } // cycle for snpInsideRuns + + } // cycle for group (breed) + + } // cycle for chromosome + + // calculate the avg percentage + for (unsigned int i=0; i Date: Mon, 4 Apr 2022 16:55:38 +0200 Subject: [PATCH 10/23] :white_check_mark: test tableRunsCpp --- detectRUNS/src/functions.cpp | 46 +++++++++++++------------- detectRUNS/tests/testthat/test_Stats.R | 11 ++++++ 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index fad9804..87fd31d 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1026,7 +1026,7 @@ DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){ LogicalVector chrom_idx(chromosomes.size()); - Rprintf("Got %d runs\n", runs.nrows()); + // Rprintf("Got %d runs\n", runs.nrows()); for (unsigned int j=0; j(chromosomes[j]) == chrom); @@ -1066,7 +1066,7 @@ DataFrame subset_map_by_chrom(DataFrame mappa, std::string chrom){ Named("POSITION") = positions[chrom_idx] ); - Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str()); + // Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str()); return res; } @@ -1083,7 +1083,7 @@ DataFrame filter_snpInsideRuns_by_threshold( LogicalVector idx(percentages.size()); - Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows()); + // Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows()); for (unsigned int j=0; j= threshold); @@ -1099,7 +1099,7 @@ DataFrame filter_snpInsideRuns_by_threshold( Named("Number") = numbers[idx], _["stringsAsFactors"] = false); - Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold); + // Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold); return res; } @@ -1234,7 +1234,7 @@ DataFrame tableRunsCpp( for (unsigned int j=0; j(group_list[j]); - Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str()); + // Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str()); DataFrame group_subset = filter_snpInsideRuns_by_breed(snpInsideRuns, grp); @@ -1255,13 +1255,13 @@ DataFrame tableRunsCpp( int snp_count = 1; double sum_pct = sum_pcts[0]; - Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from); - Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count); - Rprintf("sum_pct: %f\n", sum_pct); + // Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from); + // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count); + // Rprintf("sum_pct: %f\n", sum_pct); for (unsigned int x=1; x(snp_names[x]).c_str()); - Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); + // Rprintf("Processing: %s, ", as(snp_names[x]).c_str()); + // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); // get current index int new_idx = idxs[x]; @@ -1275,11 +1275,11 @@ DataFrame tableRunsCpp( if ((diff > 1) | (x == group_subset.nrows()-1)) { if (x == group_subset.nrows()-1) { - Rcout << "End of subset\n"; + // Rcout << "End of subset\n"; end_SNP = snp_names[x]; TO = positions[x]; } else { - Rcout << "End of segment\n"; + // Rcout << "End of segment\n"; end_SNP = snp_names[x-1]; TO = positions[x-1]; } @@ -1295,15 +1295,15 @@ DataFrame tableRunsCpp( res_to.push_back(TO); res_avg_pct.push_back(sum_pct); - Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring()); - Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str()); - Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO); - Rprintf("sum_pct: %d\n", sum_pct); + // Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring()); + // Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str()); + // Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO); + // Rprintf("sum_pct: %d\n", sum_pct); } // reset variable - Rprintf("Start a new segment: %s, ", as(snp_names[x]).c_str()); - Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); + // Rprintf("Start a new segment: %s, ", as(snp_names[x]).c_str()); + // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); snp_count = 1; sum_pct = sum_pcts[x]; @@ -1314,8 +1314,8 @@ DataFrame tableRunsCpp( snp_count++; sum_pct += sum_pcts[x]; - Rprintf("Add %s to group ", as(snp_names[x]).c_str()); - Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct); + // Rprintf("Add %s to group ", as(snp_names[x]).c_str()); + // Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct); } // update index @@ -1334,7 +1334,7 @@ DataFrame tableRunsCpp( Rcout << "Calculation % SNP in ROH finish\n"; return DataFrame::create( - Named("Group") = res_group, + Named("Group") = fast_factor(res_group), Named("Start_SNP") = res_start_snp, Named("End_SNP") = res_end_snp, Named("chrom") = res_chrom, diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index eb47a79..8864381 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -19,3 +19,14 @@ test_that("Test tableRuns", { expect_equal(reference, test) }) + +test_that("Test tableRunsCpp", { + colClasses <- c("factor", "character", "character", "character", "numeric", + "integer", "integer", "numeric") + reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) + + runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') + test <- tableRunsCpp(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) + + expect_equal(reference, test) +}) From 3c7d23598568c6ed3ee578734f0b24d4bf5a1928 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 4 Apr 2022 17:39:28 +0200 Subject: [PATCH 11/23] :bug: skip tableRuns when no data available --- detectRUNS/src/functions.cpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 87fd31d..98b033a 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1211,15 +1211,34 @@ DataFrame tableRunsCpp( for (unsigned int i=0; i(unique_chromosomes[i]); - Rprintf("Processing %d/%d chromosome\n", i+1, unique_chromosomes.size()); + Rprintf( + "Processing chromosome '%s' (%d/%d)\n", + chrom.c_str(), + i+1, unique_chromosomes.size()); // extract the desired chrom DataFrame runsChrom = subset_runs_by_chrom(runs, chrom); + + if (runsChrom.nrows() == 0) { + REprintf("No runs on chromosome %s\n", chrom.c_str()); + continue; + } + DataFrame mapKrom = subset_map_by_chrom(mappa, chrom); + if (mapKrom.nrows() == 0) { + REprintf("No SNPs data for chromosome %s\n", chrom.c_str()); + continue; + } + // calculate snpInsideRuns DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile); + if (snpInsideRuns.nrows() == 0) { + REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str()); + continue; + } + // add a column with the row names as integer IntegerVector Number = seq(1, snpInsideRuns.nrows()); snpInsideRuns.push_back(Number, "Number"); @@ -1227,6 +1246,13 @@ DataFrame tableRunsCpp( // now filter snpInsideRuns relying on threshold snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); + if (snpInsideRuns.nrows() == 0) { + REprintf( + "No SNPs in runs after filtering %f for chromosome %s\n", + threshold_used, chrom.c_str()); + continue; + } + // collect the distinct breeds CharacterVector breeds = snpInsideRuns["BREED"]; CharacterVector group_list = unique(breeds); From c78177a76564241bdbea31a513c27e1fba20ebfe Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Mon, 4 Apr 2022 17:47:12 +0200 Subject: [PATCH 12/23] :recycle: refactor snpInsideRunsCpp internal variables no more copy values from dataframe --- detectRUNS/src/functions.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 98b033a..aa1a401 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -921,10 +921,10 @@ void Runs::dumpRuns() { DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile) { - // transform R object in Cpp object - std::vector POSITIONS = as >(mapChrom["POSITION"]); - std::vector SNP_NAME = as >(mapChrom["SNP_NAME"]); - std::vector CHR = as >(mapChrom["CHR"]); + // get columns as vectors + IntegerVector POSITIONS = mapChrom["POSITION"]; + CharacterVector SNP_NAME = mapChrom["SNP_NAME"]; + CharacterVector CHR = mapChrom["CHR"]; // get unique breeds CharacterVector population = runsChrom["POPULATION"]; @@ -1212,7 +1212,7 @@ DataFrame tableRunsCpp( std::string chrom = as(unique_chromosomes[i]); Rprintf( - "Processing chromosome '%s' (%d/%d)\n", + "Processing chromosome '%s' (%d/%d)\n", chrom.c_str(), i+1, unique_chromosomes.size()); From 677fcad41898ace90371ee6fb8f4bfe9cd17030f Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 12:01:53 +0200 Subject: [PATCH 13/23] :recycle: filter tableRuns by threshold before evaluating breeds --- detectRUNS/R/Stats.R | 7 +++++-- detectRUNS/tests/testthat/test_Stats.R | 18 +++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index cdbf55e..0ae1597 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -475,9 +475,12 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { # add a column with the row names as integer snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns)) + # filter by threshold once + snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ] + for (grp in group_list) { - # create subset for group/thresold - group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE >= threshold_used, ] + # create subset for group + group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp), ] # after filtering, I need to have at least 2 rows or I can't do the following stuff if (nrow(group_subset) < 2) { diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index 8864381..fe6a2b6 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -3,29 +3,33 @@ library(detectRUNS) context("Testing Stats") # get file paths -genotypeFile <- "test.ped" +genotypeFile <- "test.ped" mapFile <- "test.map" rawFile <- "test.raw" runsFile <- "test.ROHet.consecutive.csv" test_that("Test tableRuns", { - colClasses <- c("factor", "character", "character", "character", "numeric", - "integer", "integer", "numeric") + colClasses <- c( + "factor", "character", "character", "character", "numeric", + "integer", "integer", "numeric" + ) reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) levels(reference$Group) <- c("Jacobs", "Navajo-Churro") - runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') + runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) expect_equal(reference, test) }) test_that("Test tableRunsCpp", { - colClasses <- c("factor", "character", "character", "character", "numeric", - "integer", "integer", "numeric") + colClasses <- c( + "factor", "character", "character", "character", "numeric", + "integer", "integer", "numeric" + ) reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) - runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS') + runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") test <- tableRunsCpp(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) expect_equal(reference, test) From 84eb6e6c2a1191b1cebefaa8c9b7ded4e1e548f0 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 12:22:29 +0200 Subject: [PATCH 14/23] :monocle_face: evaluate tableRuns performance in Cpp and R --- performance/test2.R | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 performance/test2.R diff --git a/performance/test2.R b/performance/test2.R new file mode 100644 index 0000000..51cf936 --- /dev/null +++ b/performance/test2.R @@ -0,0 +1,26 @@ + +# clean up +rm(list = ls()) + +# importing libraries +library(detectRUNS) +library(microbenchmark) + +genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") +mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") +runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") + +# read runs +runs <- readExternalRuns(runsFile, program = "detectRUNS") + +# how many times perform test +times <- 100 + +test_tableRuns <- microbenchmark( + R = tableRuns(runs, mapFile = mapFile, genotypeFile = genotypeFile), + Cpp = tableRunsCpp(runs, mapFile = mapFile, genotypeFile = genotypeFile), + unit = 'ms', + times = times +) + +boxplot(test_tableRuns) From d05ef4de01eba5996a0d78411e59159135dd5a56 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 14:56:18 +0200 Subject: [PATCH 15/23] :poop: duplicate snpInsideRunsCpp to ensure backward compatibility --- detectRUNS/R/RcppExports.R | 20 +++++ detectRUNS/R/Stats.R | 5 +- detectRUNS/man/countSnpInRunsCpp.Rd | 22 +++++ detectRUNS/src/RcppExports.cpp | 14 +++ detectRUNS/src/functions.cpp | 129 ++++++++++++++++++++++++++-- 5 files changed, 180 insertions(+), 10 deletions(-) create mode 100644 detectRUNS/man/countSnpInRunsCpp.Rd diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index 946d400..03466ec 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -196,6 +196,26 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) } +#' Function to count number of times a SNP is in a RUN +#' +#' Similar to snpInsideRunsCpp, it apply additional column to dataframe. +#' Require to ensure backward compatibility. +#' +#' @param runsChrom R object (dataframe) with results per chromosome +#' @param mapChrom R map object with SNP per chromosome +#' @param genotypeFile genotype (.ped) file location +#' +#' @return dataframe with counts per SNP in runs (per population) +#' +#' @import utils +#' +#' @useDynLib detectRUNS +#' @importFrom Rcpp sourceCpp +#' +countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { + .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) +} + #' Function to retrieve most common runs in the population #' #' This function takes in input either the run results and returns a subset of diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 0ae1597..9e740ce 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -470,10 +470,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { mapKrom <- mappa[mappa$CHR == chrom, ] # calculate snpInsideRuns - snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile) - - # add a column with the row names as integer - snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns)) + snpInsideRuns <- countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile) # filter by threshold once snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ] diff --git a/detectRUNS/man/countSnpInRunsCpp.Rd b/detectRUNS/man/countSnpInRunsCpp.Rd new file mode 100644 index 0000000..56780ed --- /dev/null +++ b/detectRUNS/man/countSnpInRunsCpp.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{countSnpInRunsCpp} +\alias{countSnpInRunsCpp} +\title{Function to count number of times a SNP is in a RUN} +\usage{ +countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile) +} +\arguments{ +\item{runsChrom}{R object (dataframe) with results per chromosome} + +\item{mapChrom}{R map object with SNP per chromosome} + +\item{genotypeFile}{genotype (.ped) file location} +} +\value{ +dataframe with counts per SNP in runs (per population) +} +\description{ +Similar to snpInsideRunsCpp, it apply additional column to dataframe. +Require to ensure backward compatibility. +} diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index 862949d..b4252f5 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -159,6 +159,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// countSnpInRunsCpp +DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile); +RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP); + Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP); + Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); + rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile)); + return rcpp_result_gen; +END_RCPP +} // tableRunsCpp DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold); RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) { @@ -186,6 +199,7 @@ static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1}, {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9}, {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3}, + {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 3}, {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4}, {NULL, NULL, 0} }; diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index aa1a401..62960fe 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1015,6 +1015,127 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, } +//' Function to count number of times a SNP is in a RUN +//' +//' Similar to snpInsideRunsCpp, it apply additional column to dataframe. +//' Require to ensure backward compatibility. +//' +//' @param runsChrom R object (dataframe) with results per chromosome +//' @param mapChrom R map object with SNP per chromosome +//' @param genotypeFile genotype (.ped) file location +//' +//' @return dataframe with counts per SNP in runs (per population) +//' +//' @import utils +//' +//' @useDynLib detectRUNS +//' @importFrom Rcpp sourceCpp +//' +// [[Rcpp::export]] +DataFrame countSnpInRunsCpp( + DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile) { + + // get columns as vectors + IntegerVector POSITIONS = mapChrom["POSITION"]; + CharacterVector SNP_NAME = mapChrom["SNP_NAME"]; + CharacterVector CHR = mapChrom["CHR"]; + + // get unique breeds + CharacterVector population = runsChrom["POPULATION"]; + std::vector unique_breeds = as >(unique(population)); + + // sort unique breeds + std::sort(unique_breeds.begin(), unique_breeds.end()); + + // declare others variables + std::string ras; + int pos, index, map_size = SNP_NAME.size(); + std::map snpCounts, nBreeds ; + + // define result size like n SNPs * unique_breeds + int result_size = map_size * unique_breeds.size(); + + // the columns of data.frame Defining data types accordingly slinding window + CharacterVector snp_name(result_size); + CharacterVector chrom(result_size); + IntegerVector position(result_size); + IntegerVector count(result_size); + CharacterVector breed(result_size); + NumericVector percentage(result_size); + IntegerVector number(result_size); + + // get all populations + DataFrame pops = readPOPCpp(genotypeFile); + CharacterVector pop = pops["POP"]; + + // instantiate a Runs object + Runs runs(runsChrom); + + // debug: dump object + // runs.dumpRuns(); + + // cicle among single breeds and find breed numbers + for (unsigned int i=0; i(snp_names[x]).c_str()); // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); From ef971b6a410f44e928d736062d292bed17a99f86 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 15:27:09 +0200 Subject: [PATCH 16/23] :zap: no more pre-allocate memory in countSnpInRunsCpp --- detectRUNS/DESCRIPTION | 2 +- detectRUNS/R/RcppExports.R | 5 +- detectRUNS/R/Stats.R | 8 ++- detectRUNS/man/countSnpInRunsCpp.Rd | 4 +- detectRUNS/src/RcppExports.cpp | 9 ++-- detectRUNS/src/functions.cpp | 69 +++++++++++--------------- detectRUNS/tests/testthat/test_Stats.R | 1 - 7 files changed, 44 insertions(+), 54 deletions(-) diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index e4c980d..1099bda 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,7 +2,7 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9002 +Version: 0.9.6.9003 Date: 2022-04-04 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index 03466ec..684a2f4 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -204,6 +204,7 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { #' @param runsChrom R object (dataframe) with results per chromosome #' @param mapChrom R map object with SNP per chromosome #' @param genotypeFile genotype (.ped) file location +#' @param threshold filter out SNPs below this value #' #' @return dataframe with counts per SNP in runs (per population) #' @@ -212,8 +213,8 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { #' @useDynLib detectRUNS #' @importFrom Rcpp sourceCpp #' -countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { - .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) +countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile, threshold = 50) { + .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile, threshold) } #' Function to retrieve most common runs in the population diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 9e740ce..626e604 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -469,11 +469,9 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { runsChrom <- runs[runs$CHROMOSOME == chrom, ] mapKrom <- mappa[mappa$CHR == chrom, ] - # calculate snpInsideRuns - snpInsideRuns <- countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile) - - # filter by threshold once - snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ] + # calculate snpInsideRuns with threshold + snpInsideRuns <- countSnpInRunsCpp( + runsChrom, mapKrom, genotypeFile, threshold_used) for (grp in group_list) { # create subset for group diff --git a/detectRUNS/man/countSnpInRunsCpp.Rd b/detectRUNS/man/countSnpInRunsCpp.Rd index 56780ed..f48c341 100644 --- a/detectRUNS/man/countSnpInRunsCpp.Rd +++ b/detectRUNS/man/countSnpInRunsCpp.Rd @@ -4,7 +4,7 @@ \alias{countSnpInRunsCpp} \title{Function to count number of times a SNP is in a RUN} \usage{ -countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile) +countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold = 50) } \arguments{ \item{runsChrom}{R object (dataframe) with results per chromosome} @@ -12,6 +12,8 @@ countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile) \item{mapChrom}{R map object with SNP per chromosome} \item{genotypeFile}{genotype (.ped) file location} + +\item{threshold}{filter out SNPs below this value} } \value{ dataframe with counts per SNP in runs (per population) diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index b4252f5..14f87ea 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -160,15 +160,16 @@ BEGIN_RCPP END_RCPP } // countSnpInRunsCpp -DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile); -RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) { +DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile, float threshold); +RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP, SEXP thresholdSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP); Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP); Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); - rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile)); + Rcpp::traits::input_parameter< float >::type threshold(thresholdSEXP); + rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold)); return rcpp_result_gen; END_RCPP } @@ -199,7 +200,7 @@ static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1}, {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9}, {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3}, - {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 3}, + {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 4}, {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4}, {NULL, NULL, 0} }; diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 62960fe..a9fffae 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1023,6 +1023,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, //' @param runsChrom R object (dataframe) with results per chromosome //' @param mapChrom R map object with SNP per chromosome //' @param genotypeFile genotype (.ped) file location +//' @param threshold filter out SNPs below this value //' //' @return dataframe with counts per SNP in runs (per population) //' @@ -1033,7 +1034,8 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, //' // [[Rcpp::export]] DataFrame countSnpInRunsCpp( - DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile) { + DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile, + float threshold = 50) { // get columns as vectors IntegerVector POSITIONS = mapChrom["POSITION"]; @@ -1049,20 +1051,17 @@ DataFrame countSnpInRunsCpp( // declare others variables std::string ras; - int pos, index, map_size = SNP_NAME.size(); + int pos; float avg_perc; std::map snpCounts, nBreeds ; - // define result size like n SNPs * unique_breeds - int result_size = map_size * unique_breeds.size(); - // the columns of data.frame Defining data types accordingly slinding window - CharacterVector snp_name(result_size); - CharacterVector chrom(result_size); - IntegerVector position(result_size); - IntegerVector count(result_size); - CharacterVector breed(result_size); - NumericVector percentage(result_size); - IntegerVector number(result_size); + CharacterVector snp_name; + CharacterVector chrom; + IntegerVector position; + IntegerVector count; + CharacterVector breed; + NumericVector percentage; + IntegerVector number; // get all populations DataFrame pops = readPOPCpp(genotypeFile); @@ -1096,25 +1095,21 @@ DataFrame countSnpInRunsCpp( // get a breed ras = unique_breeds[i]; - //calculating results index - index = i * map_size + j; + // calculate percentage + avg_perc = double(snpCounts[ras])/nBreeds[ras]*100; - // update values - number[index] = j+1; - count[index] = snpCounts[ras]; - breed[index] = ras; - percentage[index] = double(snpCounts[ras])/nBreeds[ras]*100; + if (avg_perc >= threshold) { + // track SNP + number.push_back(j+1); + count.push_back(snpCounts[ras]); + breed.push_back(ras); + percentage.push_back(avg_perc); - // debug - // if (SNP_NAME[j] == "OAR24_6970428.1") { - // Rcout << "Index i: " << i << " Index j: " << j << " Final index: " << index; - // Rcout << " Breed: " << ras << " Count: " << snpCounts[ras] << std::endl; - // } - - // read data from mapChrom - snp_name[index] = SNP_NAME[j]; - chrom[index] = CHR[j]; - position[index] = pos; + // read data from mapChrom + snp_name.push_back(SNP_NAME[j]); + chrom.push_back(CHR[j]); + position.push_back(pos); + } } // cicle for snp position @@ -1353,20 +1348,14 @@ DataFrame tableRunsCpp( } // calculate snpInsideRuns - DataFrame snpInsideRuns = countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile); - - if (snpInsideRuns.nrows() == 0) { - REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str()); - continue; - } - - // now filter snpInsideRuns relying on threshold - snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); + DataFrame snpInsideRuns = countSnpInRunsCpp( + runsChrom, mapKrom, genotypeFile, threshold_used); if (snpInsideRuns.nrows() == 0) { REprintf( - "No SNPs in runs after filtering %f for chromosome %s\n", - threshold_used, chrom.c_str()); + "No SNPs in runs for chromosome %s with threshold %f\n", + chrom.c_str(), + threshold); continue; } diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index fe6a2b6..0ec0fdc 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -14,7 +14,6 @@ test_that("Test tableRuns", { "integer", "integer", "numeric" ) reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) - levels(reference$Group) <- c("Jacobs", "Navajo-Churro") runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) From 8d081e3a86a6ba72b2a414fddae5110b7350d18a Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 15:44:36 +0200 Subject: [PATCH 17/23] :rewind: revert countSnpInRunsCpp related code --- detectRUNS/DESCRIPTION | 2 +- detectRUNS/R/RcppExports.R | 21 ---- detectRUNS/R/Stats.R | 11 +- detectRUNS/man/countSnpInRunsCpp.Rd | 24 ----- detectRUNS/src/RcppExports.cpp | 15 --- detectRUNS/src/functions.cpp | 138 +++---------------------- detectRUNS/tests/testthat/test_Stats.R | 1 + 7 files changed, 26 insertions(+), 186 deletions(-) delete mode 100644 detectRUNS/man/countSnpInRunsCpp.Rd diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index 1099bda..e4c980d 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,7 +2,7 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9003 +Version: 0.9.6.9002 Date: 2022-04-04 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index 684a2f4..946d400 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -196,27 +196,6 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) } -#' Function to count number of times a SNP is in a RUN -#' -#' Similar to snpInsideRunsCpp, it apply additional column to dataframe. -#' Require to ensure backward compatibility. -#' -#' @param runsChrom R object (dataframe) with results per chromosome -#' @param mapChrom R map object with SNP per chromosome -#' @param genotypeFile genotype (.ped) file location -#' @param threshold filter out SNPs below this value -#' -#' @return dataframe with counts per SNP in runs (per population) -#' -#' @import utils -#' -#' @useDynLib detectRUNS -#' @importFrom Rcpp sourceCpp -#' -countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile, threshold = 50) { - .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile, threshold) -} - #' Function to retrieve most common runs in the population #' #' This function takes in input either the run results and returns a subset of diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 626e604..0ae1597 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -469,9 +469,14 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { runsChrom <- runs[runs$CHROMOSOME == chrom, ] mapKrom <- mappa[mappa$CHR == chrom, ] - # calculate snpInsideRuns with threshold - snpInsideRuns <- countSnpInRunsCpp( - runsChrom, mapKrom, genotypeFile, threshold_used) + # calculate snpInsideRuns + snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile) + + # add a column with the row names as integer + snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns)) + + # filter by threshold once + snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ] for (grp in group_list) { # create subset for group diff --git a/detectRUNS/man/countSnpInRunsCpp.Rd b/detectRUNS/man/countSnpInRunsCpp.Rd deleted file mode 100644 index f48c341..0000000 --- a/detectRUNS/man/countSnpInRunsCpp.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{countSnpInRunsCpp} -\alias{countSnpInRunsCpp} -\title{Function to count number of times a SNP is in a RUN} -\usage{ -countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold = 50) -} -\arguments{ -\item{runsChrom}{R object (dataframe) with results per chromosome} - -\item{mapChrom}{R map object with SNP per chromosome} - -\item{genotypeFile}{genotype (.ped) file location} - -\item{threshold}{filter out SNPs below this value} -} -\value{ -dataframe with counts per SNP in runs (per population) -} -\description{ -Similar to snpInsideRunsCpp, it apply additional column to dataframe. -Require to ensure backward compatibility. -} diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index 14f87ea..862949d 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -159,20 +159,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// countSnpInRunsCpp -DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile, float threshold); -RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP, SEXP thresholdSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP); - Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP); - Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); - Rcpp::traits::input_parameter< float >::type threshold(thresholdSEXP); - rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold)); - return rcpp_result_gen; -END_RCPP -} // tableRunsCpp DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold); RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) { @@ -200,7 +186,6 @@ static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1}, {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9}, {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3}, - {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 4}, {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4}, {NULL, NULL, 0} }; diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index a9fffae..aa1a401 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1015,122 +1015,6 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, } -//' Function to count number of times a SNP is in a RUN -//' -//' Similar to snpInsideRunsCpp, it apply additional column to dataframe. -//' Require to ensure backward compatibility. -//' -//' @param runsChrom R object (dataframe) with results per chromosome -//' @param mapChrom R map object with SNP per chromosome -//' @param genotypeFile genotype (.ped) file location -//' @param threshold filter out SNPs below this value -//' -//' @return dataframe with counts per SNP in runs (per population) -//' -//' @import utils -//' -//' @useDynLib detectRUNS -//' @importFrom Rcpp sourceCpp -//' -// [[Rcpp::export]] -DataFrame countSnpInRunsCpp( - DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile, - float threshold = 50) { - - // get columns as vectors - IntegerVector POSITIONS = mapChrom["POSITION"]; - CharacterVector SNP_NAME = mapChrom["SNP_NAME"]; - CharacterVector CHR = mapChrom["CHR"]; - - // get unique breeds - CharacterVector population = runsChrom["POPULATION"]; - std::vector unique_breeds = as >(unique(population)); - - // sort unique breeds - std::sort(unique_breeds.begin(), unique_breeds.end()); - - // declare others variables - std::string ras; - int pos; float avg_perc; - std::map snpCounts, nBreeds ; - - // the columns of data.frame Defining data types accordingly slinding window - CharacterVector snp_name; - CharacterVector chrom; - IntegerVector position; - IntegerVector count; - CharacterVector breed; - NumericVector percentage; - IntegerVector number; - - // get all populations - DataFrame pops = readPOPCpp(genotypeFile); - CharacterVector pop = pops["POP"]; - - // instantiate a Runs object - Runs runs(runsChrom); - - // debug: dump object - // runs.dumpRuns(); - - // cicle among single breeds and find breed numbers - for (unsigned int i=0; i= threshold) { - // track SNP - number.push_back(j+1); - count.push_back(snpCounts[ras]); - breed.push_back(ras); - percentage.push_back(avg_perc); - - // read data from mapChrom - snp_name.push_back(SNP_NAME[j]); - chrom.push_back(CHR[j]); - position.push_back(pos); - } - - } // cicle for snp position - - } // cicle for breed - - // initialize dataframe of results. - DataFrame res = DataFrame::create( - Named("SNP_NAME") = snp_name, - Named("CHR") = chrom, - Named("POSITION") = position, - Named("COUNT") = count, - Named("BREED") = fast_factor(breed), //returns a factor - Named("PERCENTAGE") = percentage, - Named("Number") = number, - _["stringsAsFactors"] = false); - - // returning all runs for this individual genotype - return(res); -} - - DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){ CharacterVector population = runs[0]; CharacterVector ind = runs[1]; @@ -1348,14 +1232,24 @@ DataFrame tableRunsCpp( } // calculate snpInsideRuns - DataFrame snpInsideRuns = countSnpInRunsCpp( - runsChrom, mapKrom, genotypeFile, threshold_used); + DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile); + + if (snpInsideRuns.nrows() == 0) { + REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str()); + continue; + } + + // add a column with the row names as integer + IntegerVector Number = seq(1, snpInsideRuns.nrows()); + snpInsideRuns.push_back(Number, "Number"); + + // now filter snpInsideRuns relying on threshold + snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); if (snpInsideRuns.nrows() == 0) { REprintf( - "No SNPs in runs for chromosome %s with threshold %f\n", - chrom.c_str(), - threshold); + "No SNPs in runs after filtering %f for chromosome %s\n", + threshold_used, chrom.c_str()); continue; } @@ -1391,7 +1285,7 @@ DataFrame tableRunsCpp( // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count); // Rprintf("sum_pct: %f\n", sum_pct); - for (int x=1; x(snp_names[x]).c_str()); // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index 0ec0fdc..fe6a2b6 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -14,6 +14,7 @@ test_that("Test tableRuns", { "integer", "integer", "numeric" ) reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) + levels(reference$Group) <- c("Jacobs", "Navajo-Churro") runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) From 43614e6a6b4fbe9901d80d41d1bed97b6a8cb895 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 16:14:08 +0200 Subject: [PATCH 18/23] :recycle: add Number column to snpInsideRunsCpp result --- detectRUNS/DESCRIPTION | 4 +-- detectRUNS/R/Stats.R | 7 ++--- detectRUNS/R/plots.R | 8 ++++++ detectRUNS/src/functions.cpp | 19 ++++++++------ detectRUNS/tests/testthat/test_functions.R | 30 +++++++++++++--------- 5 files changed, 43 insertions(+), 25 deletions(-) diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index e4c980d..efcfced 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,8 +2,8 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9002 -Date: 2022-04-04 +Version: 0.9.6.9003 +Date: 2022-04-05 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"), diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 0ae1597..d14da1c 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -379,6 +379,10 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ runsChrom <- runs[runs$CHROMOSOME==chrom,] mapKrom <- mappa[mappa$CHR==chrom,] snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) + + # remove Number column + snpInRuns$Number <- NULL + all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) n=n+1 setTxtProgressBar(pb, n) @@ -472,9 +476,6 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { # calculate snpInsideRuns snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile) - # add a column with the row names as integer - snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns)) - # filter by threshold once snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ] diff --git a/detectRUNS/R/plots.R b/detectRUNS/R/plots.R index 0ac2db9..0b865c4 100644 --- a/detectRUNS/R/plots.R +++ b/detectRUNS/R/plots.R @@ -323,6 +323,10 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa print(paste("N.of SNP is",nrow(mapChrom))) snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + + # remove Number column + snpInRuns$Number <- NULL + krom <- subset(snpInRuns,CHR==chromosome) p <- ggplot(data=krom, aes(x=POSITION/(10^6), y=PERCENTAGE, colour=BREED)) @@ -433,6 +437,10 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33, runsChrom <- runs[runs$CHROMOSOME==chrom,] mapChrom <- mappa[mappa$CHR==chrom,] snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile) + + # remove Number column + snpInRuns$Number <- NULL + all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) n=n+1 setTxtProgressBar(pb, n) diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index aa1a401..554c6d9 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -948,6 +948,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, IntegerVector count(result_size); CharacterVector breed(result_size); NumericVector percentage(result_size); + IntegerVector number(result_size); // get all populations DataFrame pops = readPOPCpp(genotypeFile); @@ -988,6 +989,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, count[index] = snpCounts[ras]; breed[index] = ras; percentage[index] = double(snpCounts[ras])/nBreeds[ras]*100; + number[index] = j+1; // debug // if (SNP_NAME[j] == "OAR24_6970428.1") { @@ -1006,9 +1008,14 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, // initialize dataframe of results. DataFrame res = DataFrame::create( - Named("SNP_NAME")=snp_name, Named("CHR")=chrom, Named("POSITION")=position, - Named("COUNT")=count, Named("BREED")=fast_factor(breed) , //returns a factor - Named("PERCENTAGE")=percentage, _["stringsAsFactors"] = false); + Named("SNP_NAME") = snp_name, + Named("CHR") = chrom, + Named("POSITION") = position, + Named("COUNT") = count, + Named("BREED") = fast_factor(breed), //returns a factor + Named("PERCENTAGE") = percentage, + Named("Number") = number, + _["stringsAsFactors"] = false); // returning all runs for this individual genotype return(res); @@ -1239,10 +1246,6 @@ DataFrame tableRunsCpp( continue; } - // add a column with the row names as integer - IntegerVector Number = seq(1, snpInsideRuns.nrows()); - snpInsideRuns.push_back(Number, "Number"); - // now filter snpInsideRuns relying on threshold snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); @@ -1285,7 +1288,7 @@ DataFrame tableRunsCpp( // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count); // Rprintf("sum_pct: %f\n", sum_pct); - for (unsigned int x=1; x(snp_names[x]).c_str()); // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); diff --git a/detectRUNS/tests/testthat/test_functions.R b/detectRUNS/tests/testthat/test_functions.R index 0b45482..40833b4 100644 --- a/detectRUNS/tests/testthat/test_functions.R +++ b/detectRUNS/tests/testthat/test_functions.R @@ -26,10 +26,10 @@ test_that("Test readMapFile", { reference <- data.table::fread(mapFile, header = FALSE, colClasses = colClasses) names(reference) <- c("CHR","SNP_NAME","x","POSITION") reference$x <- NULL - + test <- readMapFile(mapFile) expect_equal(reference, test) - + # test for a non existing file expect_error(readMapFile("does_not_exists.map"), "doesn't exists") }) @@ -845,7 +845,7 @@ test_that("Testing consecutiveRunsCpp", { test_that("Testing snpInsideRuns", { # read mapfile with custom methods mappa <- readMapFile(mapFile) - + # this is the chromosome I want to test chrom <- "24" @@ -868,6 +868,9 @@ test_that("Testing snpInsideRuns", { reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile) test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + # remove Number column from test + test$Number <- NULL + # testing functions expect_equivalent(test, reference) @@ -876,35 +879,38 @@ test_that("Testing snpInsideRuns", { test_that("Testing snpInsideRuns with CHR as strings", { # read mapfile with custom methods mappa <- readMapFile(mapFile) - + # this is the chromosome I want to test chrom <- "X" - + # replacing chromsomes with a string value (which cannot be converted as int) mappa$CHR <- chrom - + # subsetting mapChrom (get x random snps from mapfile) set.seed(42) mapChrom <- mappa[mappa$CHR==chrom, ] mapChrom <- mapChrom[sort(sample(nrow(mapChrom), 10)), ] - + # loading pre-calculated data runsFile <- "test.ROHet.sliding.csv" colClasses <- c(rep("character", 3), rep("numeric", 4) ) runs <- read.csv2(runsFile, header = TRUE, stringsAsFactors = FALSE, colClasses = colClasses) - + # replacing chromsomes with a string value (which cannot be converted as int) runs$chrom <- chrom - + # fix column names and define runsChrom names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") runsChrom <- runs[runs$CHROMOSOME==chrom, ] - + # get snps inside runs reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile) test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) - + + # remove Number column from test + test$Number <- NULL + # testing functions expect_equivalent(test, reference) -}) \ No newline at end of file +}) From ab4d958377e9ab9fc27db7346095811f3ccf433a Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Tue, 5 Apr 2022 16:45:15 +0200 Subject: [PATCH 19/23] :fire: remove tableRuns form R code --- detectRUNS/NAMESPACE | 1 - detectRUNS/R/RcppExports.R | 6 +- detectRUNS/R/Stats.R | 169 ------------------------- detectRUNS/man/tableRuns.Rd | 8 +- detectRUNS/man/tableRunsCpp.Rd | 52 -------- detectRUNS/src/RcppExports.cpp | 10 +- detectRUNS/src/functions.cpp | 4 +- detectRUNS/tests/testthat/test_Stats.R | 14 -- 8 files changed, 15 insertions(+), 249 deletions(-) delete mode 100644 detectRUNS/man/tableRunsCpp.Rd diff --git a/detectRUNS/NAMESPACE b/detectRUNS/NAMESPACE index 11039e4..183f836 100644 --- a/detectRUNS/NAMESPACE +++ b/detectRUNS/NAMESPACE @@ -16,7 +16,6 @@ export(reorderDF) export(slidingRUNS.run) export(summaryRuns) export(tableRuns) -export(tableRunsCpp) import(ggplot2) import(itertools) import(plyr) diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index 946d400..ecdf88d 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -232,14 +232,14 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { #' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") #' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") #' -#' table <- tableRunsCpp( +#' table <- tableRuns( #' runs = runsDF, genotypeFile = genotypeFile, #' mapFile = mapFile, threshold = 0.5) #' #' @useDynLib detectRUNS #' @importFrom Rcpp sourceCpp #' -tableRunsCpp <- function(runs, genotypeFile, mapFile, threshold = 0.5) { - .Call('_detectRUNS_tableRunsCpp', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold) +tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) { + .Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold) } diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index d14da1c..343df49 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -397,172 +397,3 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ return(result_summary) } - - -#' Function to retrieve most common runs in the population -#' -#' This function takes in input either the run results and returns a subset of -#' the runs most commonly found in the group/population. The parameter -#' \code{threshold} controls the definition -#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) -#' -#' @param genotypeFile Plink ped file (for SNP position) -#' @param mapFile Plink map file (for SNP position) -#' @param runs R object (dataframe) with results on detected runs -#' @param threshold value from 0 to 1 (default 0.7) that controls the desired -#' proportion of individuals carrying that run (e.g. 70\%) -#' -#' @return A dataframe with the most common runs detected in the sampled individuals -#' (the group/population, start and end position of the run, chromosome, number of SNP -#' included in the run and average percentage of SNPs in run -#' are reported in the output dataframe) -#' @export -#' -#' @examples -#' # getting map and ped paths -#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") -#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") -#' -#' # calculating runs of Homozygosity -#' \dontrun{ -#' # skipping runs calculation -#' runs <- slidingRUNS.run(genotypeFile, mapFile, -#' windowSize = 15, threshold = 0.1, minSNP = 15, -#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 -#' ) -#' } -#' # loading pre-calculated data -#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") -#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") -#' -#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) -#' -tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) { - - # set a threshold - threshold_used <- threshold * 100 - message(paste("Threshold used:", threshold_used)) - - # read map file - mappa <- readMapFile(mapFile) - - # change colnames in runs file - names(runs) <- c("POPULATION", "IND", "CHROMOSOME", "COUNT", "START", "END", "LENGTH") - - # vector of breeds - group_list <- as.vector(unique(runs$POPULATION)) - - # final data frame - final_table <- data.frame( - "GROUP" = character(0), "Start_SNP" = character(0), "End_SNP" = character(0), - "chrom" = character(0), "nSNP" = integer(0), "from" = integer(0), "to" = integer(0), "avg_pct" = numeric(0) - ) - - # Start calculation % SNP in ROH - message("Calculation % SNP in ROH") # FILIPPO - - # create progress bar - total <- length(unique(runs$CHROMOSOME)) - message(paste("Chromosome founds: ", total)) # FILIPPO - n <- 0 - pb <- txtProgressBar(min = 0, max = total, style = 3) - - # SNP in ROH - for (chrom in sort(unique(runs$CHROMOSOME))) { - # extract from map and runs only the chromosome I need - runsChrom <- runs[runs$CHROMOSOME == chrom, ] - mapKrom <- mappa[mappa$CHR == chrom, ] - - # calculate snpInsideRuns - snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile) - - # filter by threshold once - snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ] - - for (grp in group_list) { - # create subset for group - group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp), ] - - # after filtering, I need to have at least 2 rows or I can't do the following stuff - if (nrow(group_subset) < 2) { - next - } - - # message("Start from: ", paste0(group_subset[1, ], sep = " ")) - - # initialize stuff - old_idx <- group_subset[1, 7] - from <- group_subset[1, 3] - Start_SNP <- group_subset[1, 1] - snp_count <- 1 - sum_pct <- group_subset[1, "PERCENTAGE"] - - for (x in 2:nrow(group_subset)) { - # message("Processing: ", paste0(group_subset[x, ], sep = " ")) - - # get current index - new_idx <- group_subset[x, 7] - - # Difference between indexes. This will be 1 in case of adjacent rows - diff <- new_idx - old_idx - - if ((diff > 1) | x == nrow(group_subset)) { - if (x == nrow(group_subset)) { - # message("End of subset") - end_SNP <- group_subset[x, 1] - TO <- group_subset[x, 3] - } else { - # message("End of segment") - end_SNP <- group_subset[x - 1, 1] - TO <- group_subset[x - 1, 3] - } - - # throw away row if composed by only one SNP - if (snp_count > 1) { - final_table <- rbind.data.frame(final_table, final_table = data.frame( - "Group" = group_subset[x - 1, 5], - "Start_SNP" = Start_SNP, - "End_SNP" = end_SNP, - "chrom" = group_subset[x - 1, 2], - "nSNP" = snp_count, - "from" = from, - "to" = TO, - "avg_pct" = sum_pct - )) - - # message("Writing: ", paste0(tail(final_table, n = 1), sep = " ")) - } - - # reset variable - # message("Start a new segment: ", paste0(group_subset[x, ], sep = " ")) - snp_count <- 1 - sum_pct <- group_subset[x, "PERCENTAGE"] - from <- group_subset[x, 3] - Start_SNP <- group_subset[x, 1] - } else { - # update stuff - snp_count <- snp_count + 1 - sum_pct <- sum_pct + group_subset[x, "PERCENTAGE"] - - # message("Add snp to group: ", paste(snp_count, new_idx, old_idx, sum_pct)) - } - - # update index - old_idx <- new_idx - } - } - - n <- n + 1 - setTxtProgressBar(pb, n) - } - - close(pb) - - message("Calculation % SNP in ROH finish") # FILIPPO - - final_table$avg_pct <- final_table$avg_pct / final_table$nSNP - - row.names(final_table) <- seq(1, nrow(final_table)) - - return(final_table) -} diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd index 39e34e1..8c8bdb6 100644 --- a/detectRUNS/man/tableRuns.Rd +++ b/detectRUNS/man/tableRuns.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Stats.R +% Please edit documentation in R/RcppExports.R \name{tableRuns} \alias{tableRuns} \title{Function to retrieve most common runs in the population} \usage{ -tableRuns(runs = NULL, genotypeFile, mapFile, threshold = 0.5) +tableRuns(runs, genotypeFile, mapFile, threshold = 0.5) } \arguments{ \item{runs}{R object (dataframe) with results on detected runs} @@ -45,6 +45,8 @@ runs <- slidingRUNS.run(genotypeFile, mapFile, runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") -tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) +table <- tableRuns( + runs = runsDF, genotypeFile = genotypeFile, + mapFile = mapFile, threshold = 0.5) } diff --git a/detectRUNS/man/tableRunsCpp.Rd b/detectRUNS/man/tableRunsCpp.Rd deleted file mode 100644 index 7672a3a..0000000 --- a/detectRUNS/man/tableRunsCpp.Rd +++ /dev/null @@ -1,52 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{tableRunsCpp} -\alias{tableRunsCpp} -\title{Function to retrieve most common runs in the population} -\usage{ -tableRunsCpp(runs, genotypeFile, mapFile, threshold = 0.5) -} -\arguments{ -\item{runs}{R object (dataframe) with results on detected runs} - -\item{genotypeFile}{Plink ped file (for SNP position)} - -\item{mapFile}{Plink map file (for SNP position)} - -\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired -proportion of individuals carrying that run (e.g. 70\%)} -} -\value{ -A dataframe with the most common runs detected in the sampled individuals -(the group/population, start and end position of the run, chromosome, number of SNP -included in the run and average percentage of SNPs in run -are reported in the output dataframe) -} -\description{ -This function takes in input either the run results and returns a subset of -the runs most commonly found in the group/population. The parameter -\code{threshold} controls the definition -of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) -} -\examples{ -# getting map and ped paths -genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") -mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") - -# calculating runs of Homozygosity -\dontrun{ -# skipping runs calculation -runs <- slidingRUNS.run(genotypeFile, mapFile, - windowSize = 15, threshold = 0.1, minSNP = 15, - ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 -) -} -# loading pre-calculated data -runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") -runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") - -table <- tableRunsCpp( - runs = runsDF, genotypeFile = genotypeFile, - mapFile = mapFile, threshold = 0.5) - -} diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index 862949d..e5c9485 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -159,9 +159,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// tableRunsCpp -DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold); -RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) { +// tableRuns +DataFrame tableRuns(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold); +RcppExport SEXP _detectRUNS_tableRuns(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -169,7 +169,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); Rcpp::traits::input_parameter< std::string >::type mapFile(mapFileSEXP); Rcpp::traits::input_parameter< const float >::type threshold(thresholdSEXP); - rcpp_result_gen = Rcpp::wrap(tableRunsCpp(runs, genotypeFile, mapFile, threshold)); + rcpp_result_gen = Rcpp::wrap(tableRuns(runs, genotypeFile, mapFile, threshold)); return rcpp_result_gen; END_RCPP } @@ -186,7 +186,7 @@ static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1}, {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9}, {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3}, - {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4}, + {"_detectRUNS_tableRuns", (DL_FUNC) &_detectRUNS_tableRuns, 4}, {NULL, NULL, 0} }; diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 554c6d9..6b2a020 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1174,7 +1174,7 @@ DataFrame filter_snpInsideRuns_by_breed( //' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") //' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") //' -//' table <- tableRunsCpp( +//' table <- tableRuns( //' runs = runsDF, genotypeFile = genotypeFile, //' mapFile = mapFile, threshold = 0.5) //' @@ -1182,7 +1182,7 @@ DataFrame filter_snpInsideRuns_by_breed( //' @importFrom Rcpp sourceCpp //' // [[Rcpp::export]] -DataFrame tableRunsCpp( +DataFrame tableRuns( DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold = 0.5) { diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index fe6a2b6..28d8475 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -14,23 +14,9 @@ test_that("Test tableRuns", { "integer", "integer", "numeric" ) reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) - levels(reference$Group) <- c("Jacobs", "Navajo-Churro") runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) expect_equal(reference, test) }) - -test_that("Test tableRunsCpp", { - colClasses <- c( - "factor", "character", "character", "character", "numeric", - "integer", "integer", "numeric" - ) - reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses) - - runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") - test <- tableRunsCpp(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) - - expect_equal(reference, test) -}) From 0082c305aa4eb69125855c08cb1bd87f7b143986 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Thu, 21 Apr 2022 15:25:44 +0200 Subject: [PATCH 20/23] :zap: calculate pops outside snpInsideRunsCpp --- detectRUNS/DESCRIPTION | 4 ++-- detectRUNS/R/RcppExports.R | 6 +++--- detectRUNS/R/Stats.R | 4 +++- detectRUNS/R/plots.R | 7 +++++-- detectRUNS/man/snpInsideRunsCpp.Rd | 4 ++-- detectRUNS/src/RcppExports.cpp | 8 ++++---- detectRUNS/src/functions.cpp | 12 ++++++++---- detectRUNS/tests/testthat/test_functions.R | 8 ++++++-- 8 files changed, 33 insertions(+), 20 deletions(-) diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index efcfced..90d49ef 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,8 +2,8 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9003 -Date: 2022-04-05 +Version: 0.9.6.9004 +Date: 2022-04-21 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"), diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index ecdf88d..790e01d 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN #' #' @param runsChrom R object (dataframe) with results per chromosome #' @param mapChrom R map object with SNP per chromosome -#' @param genotypeFile genotype (.ped) file location +#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}} #' #' @return dataframe with counts per SNP in runs (per population) #' @@ -192,8 +192,8 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN #' @useDynLib detectRUNS #' @importFrom Rcpp sourceCpp #' -snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { - .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) +snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) { + .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops) } #' Function to retrieve most common runs in the population diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index 343df49..49a9ba9 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -378,7 +378,9 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ for (chrom in sort(unique(runs$CHROMOSOME))) { runsChrom <- runs[runs$CHROMOSOME==chrom,] mapKrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) + + pops <- readPOPCpp(genotypeFile) + snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops) # remove Number column snpInRuns$Number <- NULL diff --git a/detectRUNS/R/plots.R b/detectRUNS/R/plots.R index 0b865c4..9b9e03c 100644 --- a/detectRUNS/R/plots.R +++ b/detectRUNS/R/plots.R @@ -322,7 +322,8 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa mapChrom <- mappa[mappa$CHR==chromosome,] print(paste("N.of SNP is",nrow(mapChrom))) - snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + pops <- readPOPCpp(genotypeFile) + snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, pops) # remove Number column snpInRuns$Number <- NULL @@ -436,7 +437,9 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33, for (chrom in sort(unique(runs$CHROMOSOME))) { runsChrom <- runs[runs$CHROMOSOME==chrom,] mapChrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile) + + pops <- readPOPCpp(genotypeFile) + snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, pops) # remove Number column snpInRuns$Number <- NULL diff --git a/detectRUNS/man/snpInsideRunsCpp.Rd b/detectRUNS/man/snpInsideRunsCpp.Rd index 6e03a4d..e87b10f 100644 --- a/detectRUNS/man/snpInsideRunsCpp.Rd +++ b/detectRUNS/man/snpInsideRunsCpp.Rd @@ -4,14 +4,14 @@ \alias{snpInsideRunsCpp} \title{Function to count number of times a SNP is in a RUN} \usage{ -snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) +snpInsideRunsCpp(runsChrom, mapChrom, pops) } \arguments{ \item{runsChrom}{R object (dataframe) with results per chromosome} \item{mapChrom}{R map object with SNP per chromosome} -\item{genotypeFile}{genotype (.ped) file location} +\item{pops}{R object (dataframe) with populations see \code{\link{readPOPCpp}}} } \value{ dataframe with counts per SNP in runs (per population) diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index e5c9485..d56ba46 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -147,15 +147,15 @@ BEGIN_RCPP END_RCPP } // snpInsideRunsCpp -DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile); -RcppExport SEXP _detectRUNS_snpInsideRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) { +DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, DataFrame pops); +RcppExport SEXP _detectRUNS_snpInsideRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP popsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP); Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP); - Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); - rcpp_result_gen = Rcpp::wrap(snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)); + Rcpp::traits::input_parameter< DataFrame >::type pops(popsSEXP); + rcpp_result_gen = Rcpp::wrap(snpInsideRunsCpp(runsChrom, mapChrom, pops)); return rcpp_result_gen; END_RCPP } diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 6b2a020..0a83467 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -908,7 +908,7 @@ void Runs::dumpRuns() { //' //' @param runsChrom R object (dataframe) with results per chromosome //' @param mapChrom R map object with SNP per chromosome -//' @param genotypeFile genotype (.ped) file location +//' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}} //' //' @return dataframe with counts per SNP in runs (per population) //' @@ -919,7 +919,7 @@ void Runs::dumpRuns() { //' // [[Rcpp::export]] DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, - std::string genotypeFile) { + DataFrame pops) { // get columns as vectors IntegerVector POSITIONS = mapChrom["POSITION"]; @@ -951,7 +951,6 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, IntegerVector number(result_size); // get all populations - DataFrame pops = readPOPCpp(genotypeFile); CharacterVector pop = pops["POP"]; // instantiate a Runs object @@ -1199,6 +1198,11 @@ DataFrame tableRuns( // read mapfile using R function (which uses data.table::fread) DataFrame mappa = readMapFile(mapFile); + // read populations from genotypeFile + Rprintf("Read populations from '%s'...\n", genotypeFile.c_str()); + DataFrame pops = readPOPCpp(genotypeFile); + Rprintf("Done!\n"); + // get distinct chromosomes CharacterVector chromosomes = runs[2]; CharacterVector unique_chromosomes = sort_unique(chromosomes); @@ -1239,7 +1243,7 @@ DataFrame tableRuns( } // calculate snpInsideRuns - DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile); + DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, pops); if (snpInsideRuns.nrows() == 0) { REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str()); diff --git a/detectRUNS/tests/testthat/test_functions.R b/detectRUNS/tests/testthat/test_functions.R index 40833b4..6e066bb 100644 --- a/detectRUNS/tests/testthat/test_functions.R +++ b/detectRUNS/tests/testthat/test_functions.R @@ -866,7 +866,9 @@ test_that("Testing snpInsideRuns", { # get snps inside runs reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile) - test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + + pops <- readPOPCpp(genotypeFile) + test <- snpInsideRunsCpp(runsChrom, mapChrom, pops) # remove Number column from test test$Number <- NULL @@ -906,7 +908,9 @@ test_that("Testing snpInsideRuns with CHR as strings", { # get snps inside runs reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile) - test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + + pops <- readPOPCpp(genotypeFile) + test <- snpInsideRunsCpp(runsChrom, mapChrom, pops) # remove Number column from test test$Number <- NULL From 52abad1fd8731b121385e618cc19279d1f77cd61 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Thu, 21 Apr 2022 15:47:59 +0200 Subject: [PATCH 21/23] :poop: attempt to fix performance scripts - not working --- performance/test-snpInsideRunsCpp.R | 12 +++++++++++- performance/test1.R | 20 ++++++++++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/performance/test-snpInsideRunsCpp.R b/performance/test-snpInsideRunsCpp.R index a59ec19..ea9555a 100644 --- a/performance/test-snpInsideRunsCpp.R +++ b/performance/test-snpInsideRunsCpp.R @@ -1,4 +1,9 @@ +library(devtools) +library(here) + +load_all(path = here("detectRUNS/")) + # get files path mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package="detectRUNS") genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS") @@ -20,7 +25,12 @@ runs <- readExternalRuns(runsfile, program="detectRUNS") names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") runsChrom <- runs[runs$CHROMOSOME==chrom, ] +# loading pops +pops <- readPOPCpp(genotypeFile) + +message("testing snpInsideRunsCpp...") + # get snps inside runs for (i in 1:100000) { - test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + test <- snpInsideRunsCpp(runsChrom, mapChrom, pops) } diff --git a/performance/test1.R b/performance/test1.R index 5bddf16..4e8ceda 100644 --- a/performance/test1.R +++ b/performance/test1.R @@ -1,6 +1,11 @@ # Valuating detectRUNS performance +library(devtools) +library(here) + +load_all(path = here("detectRUNS/")) + # clean up rm(list = ls()) @@ -75,8 +80,8 @@ steps <- ceiling(seq(1, length(x), length.out = (x_points+1) ))[-1] # calculating runs of Homozygosity runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, - minSNP = 15, ROHet = FALSE, maxOppositeGenotype = 1, - maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000) + minSNP = 15, ROHet = FALSE, maxOppWindow = 1, + maxMissWindow = 1, minLengthBps = 100000, minDensity = 1/10000) # fix column names names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") @@ -100,11 +105,11 @@ for (i in steps) { message(paste("Test sliding window: step", i)) # calculate sliding window - y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap, + y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap, parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss) test_sliding <- microbenchmark( - slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap, + slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap, parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss), unit = 'ms', times = times @@ -118,7 +123,7 @@ for (i in steps) { # check cpp slidingWindow test_slidingCpp <- microbenchmark( - slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap, + slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap, parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss), unit = 'ms', times = times @@ -256,9 +261,12 @@ for (i in steps) { tmp <- data.frame(fun=test_fun, step=test_step, time=test_snpInsideRuns$time, language=test_language) tests <- rbind(tests, tmp) + # read pops + pops <- readPOPCpp(genotypeFile) + # check cpp snpInsideRuns test_snpInsideRunsCpp <- microbenchmark( - snpInsideRunsCpp(runs, mappa, genotypeFile), + snpInsideRunsCpp(runs, mappa, pops), unit = 'ms', times = times ) From 496642c51a5a95c52d810446e9e9760a5964c8b0 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Fri, 22 Apr 2022 11:39:21 +0200 Subject: [PATCH 22/23] :bug: fixed test1 performance test script --- performance/test1.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/performance/test1.R b/performance/test1.R index 4e8ceda..1ba0bf1 100644 --- a/performance/test1.R +++ b/performance/test1.R @@ -25,12 +25,14 @@ parameters <- list( minSNP=5, ROHet=TRUE, maxOppositeGenotype=1, + maxOppWindow=1, maxMiss=1, maxGap=10^6, minLengthBps=1000, minDensity=1/10, - maxOppRun=NULL, - maxMissRun=NULL + maxOppRun=1, + maxMissRun=1, + maxMissWindow=1 ) # how many times perform test @@ -57,7 +59,7 @@ animals <- readPOPCpp(genotypeFile = genotypeFile) # get only one individual. Get index # idx <- 1 -idx <- which(animals$ID=="H70") +idx <- which(animals$ID=="H114") # get an animal animal <- animals[idx, ] @@ -105,11 +107,11 @@ for (i in steps) { message(paste("Test sliding window: step", i)) # calculate sliding window - y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap, + y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap, parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss) test_sliding <- microbenchmark( - slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap, + slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap, parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss), unit = 'ms', times = times @@ -123,7 +125,7 @@ for (i in steps) { # check cpp slidingWindow test_slidingCpp <- microbenchmark( - slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap, + slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap, parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss), unit = 'ms', times = times From 856c488b5da4e9a6fbbb3dcf56be8d1dadd0dbc1 Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Wed, 27 Apr 2022 13:30:21 +0200 Subject: [PATCH 23/23] :bug: check threshold value in tableRun --- detectRUNS/R/RcppExports.R | 4 ++-- detectRUNS/man/tableRuns.Rd | 4 ++-- detectRUNS/src/functions.cpp | 9 +++++++-- detectRUNS/tests/testthat/test_Stats.R | 5 +++++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index 790e01d..6af6306 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -206,8 +206,8 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) { #' @param genotypeFile Plink ped file (for SNP position) #' @param mapFile Plink map file (for SNP position) #' @param runs R object (dataframe) with results on detected runs -#' @param threshold value from 0 to 1 (default 0.7) that controls the desired -#' proportion of individuals carrying that run (e.g. 70\%) +#' @param threshold value from 0 to 1 (default 0.5) that controls the desired +#' proportion of individuals carrying that run (e.g. 50\%) #' #' @return A dataframe with the most common runs detected in the sampled individuals #' (the group/population, start and end position of the run, chromosome, number of SNP diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd index 8c8bdb6..27f60aa 100644 --- a/detectRUNS/man/tableRuns.Rd +++ b/detectRUNS/man/tableRuns.Rd @@ -13,8 +13,8 @@ tableRuns(runs, genotypeFile, mapFile, threshold = 0.5) \item{mapFile}{Plink map file (for SNP position)} -\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired -proportion of individuals carrying that run (e.g. 70\%)} +\item{threshold}{value from 0 to 1 (default 0.5) that controls the desired +proportion of individuals carrying that run (e.g. 50\%)} } \value{ A dataframe with the most common runs detected in the sampled individuals diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 0a83467..73005fa 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -1147,8 +1147,8 @@ DataFrame filter_snpInsideRuns_by_breed( //' @param genotypeFile Plink ped file (for SNP position) //' @param mapFile Plink map file (for SNP position) //' @param runs R object (dataframe) with results on detected runs -//' @param threshold value from 0 to 1 (default 0.7) that controls the desired -//' proportion of individuals carrying that run (e.g. 70\%) +//' @param threshold value from 0 to 1 (default 0.5) that controls the desired +//' proportion of individuals carrying that run (e.g. 50\%) //' //' @return A dataframe with the most common runs detected in the sampled individuals //' (the group/population, start and end position of the run, chromosome, number of SNP @@ -1185,6 +1185,11 @@ DataFrame tableRuns( DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold = 0.5) { + // check for threshold value + if (threshold > 1 || threshold < 0) { + throw std::range_error("Threshold must be between 0 and 1"); + } + // set a threshold float threshold_used = threshold * 100; Rprintf("Threshold used: %f\n", threshold_used); diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R index 28d8475..7b5d321 100644 --- a/detectRUNS/tests/testthat/test_Stats.R +++ b/detectRUNS/tests/testthat/test_Stats.R @@ -19,4 +19,9 @@ test_that("Test tableRuns", { test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) expect_equal(reference, test) + + expect_error( + tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 50), + "Threshold must be between 0 and 1" + ) })