diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 4727d86..8b0479d 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -29,6 +29,5 @@ jobs: working-directory: ./detectRUNS - name: Test coverage - run: covr::codecov() + run: covr::codecov(path="detectRUNS/") shell: Rscript {0} - working-directory: ./detectRUNS diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION index e30e09c..baeefe7 100644 --- a/detectRUNS/DESCRIPTION +++ b/detectRUNS/DESCRIPTION @@ -2,11 +2,11 @@ Package: detectRUNS Type: Package Title: Detect Runs of Homozygosity and Runs of Heterozygosity in Diploid Genomes -Version: 0.9.6.9000 -Date: 2020-12-20 +Version: 0.9.6.9004 +Date: 2022-04-21 Authors@R: c( person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")), - person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"), + person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"), person("Giustino","Gaspa", email="gigaspa@uniss.it", role="aut"), person("Gabriele","Marras", email="gmarras@uoguelph.ca", role="aut") ) diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R index b67a235..6af6306 100644 --- a/detectRUNS/R/RcppExports.R +++ b/detectRUNS/R/RcppExports.R @@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN #' #' @param runsChrom R object (dataframe) with results per chromosome #' @param mapChrom R map object with SNP per chromosome -#' @param genotypeFile genotype (.ped) file location +#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}} #' #' @return dataframe with counts per SNP in runs (per population) #' @@ -192,7 +192,54 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN #' @useDynLib detectRUNS #' @importFrom Rcpp sourceCpp #' -snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) { - .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile) +snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) { + .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops) +} + +#' Function to retrieve most common runs in the population +#' +#' This function takes in input either the run results and returns a subset of +#' the runs most commonly found in the group/population. The parameter +#' \code{threshold} controls the definition +#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) +#' +#' @param genotypeFile Plink ped file (for SNP position) +#' @param mapFile Plink map file (for SNP position) +#' @param runs R object (dataframe) with results on detected runs +#' @param threshold value from 0 to 1 (default 0.5) that controls the desired +#' proportion of individuals carrying that run (e.g. 50\%) +#' +#' @return A dataframe with the most common runs detected in the sampled individuals +#' (the group/population, start and end position of the run, chromosome, number of SNP +#' included in the run and average percentage of SNPs in run +#' are reported in the output dataframe) +#' @export +#' +#' @examples +#' # getting map and ped paths +#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") +#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") +#' +#' # calculating runs of Homozygosity +#' \dontrun{ +#' # skipping runs calculation +#' runs <- slidingRUNS.run(genotypeFile, mapFile, +#' windowSize = 15, threshold = 0.1, minSNP = 15, +#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +#' ) +#' } +#' # loading pre-calculated data +#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") +#' +#' table <- tableRuns( +#' runs = runsDF, genotypeFile = genotypeFile, +#' mapFile = mapFile, threshold = 0.5) +#' +#' @useDynLib detectRUNS +#' @importFrom Rcpp sourceCpp +#' +tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) { + .Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold) } diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R index c1d7cf3..49a9ba9 100644 --- a/detectRUNS/R/Stats.R +++ b/detectRUNS/R/Stats.R @@ -17,7 +17,7 @@ chromosomeLength <- function(mapFile){ # read mapfile mappa <- readMapFile(mapFile) - + maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0 # defining NULL variables to avoid warning messages @@ -355,7 +355,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ message("Calculating SNPs inside ROH") names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") - + #read map file mappa <- readMapFile(mapFile) @@ -378,184 +378,24 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){ for (chrom in sort(unique(runs$CHROMOSOME))) { runsChrom <- runs[runs$CHROMOSOME==chrom,] mapKrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) - all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) - n=n+1 - setTxtProgressBar(pb, n) - } - close(pb) - - result_summary=append(result_summary,list(SNPinRun = all_SNPinROH)) - message("Calculation % SNP in ROH finish") #FILIPPO - } + pops <- readPOPCpp(genotypeFile) + snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops) + # remove Number column + snpInRuns$Number <- NULL - return(result_summary) -} - - -#' Function to retrieve most common runs in the population -#' -#' This function takes in input either the run results or the output from -#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run) -#' in the population/group, and returns a subset of the runs most commonly -#' found in the group/population. The parameter \code{threshold} controls the definition -#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) -#' -#' @param genotypeFile Plink ped file (for SNP position) -#' @param mapFile Plink map file (for SNP position) -#' @param runs R object (dataframe) with results on detected runs -#' @param threshold value from 0 to 1 (default 0.7) that controls the desired -#' proportion of individuals carrying that run (e.g. 70\%) -#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a -#' run in the population (output from \code{snpInsideRuns}) -#' -#' @return A dataframe with the most common runs detected in the sampled individuals -#' (the group/population, start and end position of the run, chromosome, number of SNP -#' included in the run and average percentage of SNPs in run -#' are reported in the output dataframe) -#' @export -#' -#' @examples -#' # getting map and ped paths -#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS") -#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS") -#' -#' # calculating runs of Homozygosity -#' \dontrun{ -#' # skipping runs calculation -#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15, -#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000) -#' } -#' # loading pre-calculated data -#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") -#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS') -#' -#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) -#' - -tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) { - - #set a threshold - threshold_used=threshold*100 - message(paste('Threshold used:',threshold_used)) - - # read map file - mappa <- readMapFile(mapFile) - - if(!is.null(runs) & is.null(SnpInRuns)){ - message('I found only Runs data frame. GOOD!') - - #change colnames in runs file - names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH") - - #Start calculation % SNP in ROH - message("Calculation % SNP in ROH") #FILIPPO - all_SNPinROH <- data.frame("SNP_NAME"=character(), - "CHR"=integer(), - "POSITION"=numeric(), - "COUNT"=integer(), - "BREED"=factor(), - "PERCENTAGE"=numeric(), - stringsAsFactors=FALSE) - - # create progress bar - total <- length(unique(runs$CHROMOSOME)) - message(paste('Chromosome founds: ',total)) #FILIPPO - n=0 - pb <- txtProgressBar(min = 0, max = total, style = 3) - - #SNP in ROH - for (chrom in sort(unique(runs$CHROMOSOME))) { - runsChrom <- runs[runs$CHROMOSOME==chrom,] - mapKrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile) all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) n=n+1 setTxtProgressBar(pb, n) } close(pb) + + result_summary=append(result_summary,list(SNPinRun = all_SNPinROH)) message("Calculation % SNP in ROH finish") #FILIPPO - } else if (is.null(runs) & !is.null(SnpInRuns)) { - message('I found only SNPinRuns data frame. GOOD!') - all_SNPinROH=SnpInRuns - } else{ - stop('You gave me Runs and SNPinRuns! Please choose one!') } - #consecutive number - all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE)) - - #final data frame - final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0), - "chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0), "avg_pct"=numeric(0)) - - - #vector of breeds - group_list=as.vector(unique(all_SNPinROH$BREED)) - - for (grp in group_list){ - message(paste('checking: ',grp)) - - #create subset for group/thresold - group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,]) - - #print(group_subset) - - #variable - old_pos=group_subset[1,7] - snp_pos1=group_subset[1,3] - Start_SNP=group_subset[1,1] - snp_count=0 - sum_pct = 0 - - x=2 - while(x <= length(rownames(group_subset))) { - - snp_count = snp_count + 1 - new_pos=group_subset[x,7] - old_pos=group_subset[x-1,7] - chr_old=group_subset[x-1,2] - chr_new =group_subset[x,2] - sum_pct = sum_pct + group_subset[x-1,"PERCENTAGE"] - diff=new_pos-old_pos - if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) { - if (x==length(rownames(group_subset))){ - end_SNP=group_subset[x,1] - TO=group_subset[x,3] - }else{ - end_SNP=group_subset[x-1,1] - TO=group_subset[x-1,3] - } - - final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5], - "Start_SNP"=Start_SNP, - "End_SNP"=end_SNP, - "chrom"=group_subset[x-1,2], - "nSNP"=snp_count, - "from"=snp_pos1, - "to"=TO, - "avg_pct"=sum_pct)) - - #reset variable - snp_count=0 - sum_pct=0 - snp_pos1=group_subset[x,3] - Start_SNP=group_subset[x,1] - } - - #upgrade x value - x <- x+1 - - } - } - - final_table$avg_pct = final_table$avg_pct/final_table$nSNP - - rownames(final_table) <- seq(1,length(row.names(final_table))) - return(final_table) + return(result_summary) } - diff --git a/detectRUNS/R/plots.R b/detectRUNS/R/plots.R index 66b648c..c985038 100644 --- a/detectRUNS/R/plots.R +++ b/detectRUNS/R/plots.R @@ -322,7 +322,12 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa mapChrom <- mappa[mappa$CHR==chromosome,] print(paste("N.of SNP is",nrow(mapChrom))) - snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) + pops <- readPOPCpp(genotypeFile) + snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, pops) + + # remove Number column + snpInRuns$Number <- NULL + krom <- subset(snpInRuns,CHR==chromosome) p <- ggplot(data=krom, aes(x=POSITION/(10^6), y=PERCENTAGE, colour=BREED)) @@ -432,7 +437,13 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33, for (chrom in sort(unique(runs$CHROMOSOME))) { runsChrom <- runs[runs$CHROMOSOME==chrom,] mapChrom <- mappa[mappa$CHR==chrom,] - snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile) + + pops <- readPOPCpp(genotypeFile) + snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, pops) + + # remove Number column + snpInRuns$Number <- NULL + all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns) n=n+1 setTxtProgressBar(pb, n) diff --git a/detectRUNS/man/snpInsideRunsCpp.Rd b/detectRUNS/man/snpInsideRunsCpp.Rd index 6e03a4d..e87b10f 100644 --- a/detectRUNS/man/snpInsideRunsCpp.Rd +++ b/detectRUNS/man/snpInsideRunsCpp.Rd @@ -4,14 +4,14 @@ \alias{snpInsideRunsCpp} \title{Function to count number of times a SNP is in a RUN} \usage{ -snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile) +snpInsideRunsCpp(runsChrom, mapChrom, pops) } \arguments{ \item{runsChrom}{R object (dataframe) with results per chromosome} \item{mapChrom}{R map object with SNP per chromosome} -\item{genotypeFile}{genotype (.ped) file location} +\item{pops}{R object (dataframe) with populations see \code{\link{readPOPCpp}}} } \value{ dataframe with counts per SNP in runs (per population) diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd index 1e2bf7d..27f60aa 100644 --- a/detectRUNS/man/tableRuns.Rd +++ b/detectRUNS/man/tableRuns.Rd @@ -1,29 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Stats.R +% Please edit documentation in R/RcppExports.R \name{tableRuns} \alias{tableRuns} \title{Function to retrieve most common runs in the population} \usage{ -tableRuns( - runs = NULL, - SnpInRuns = NULL, - genotypeFile, - mapFile, - threshold = 0.5 -) +tableRuns(runs, genotypeFile, mapFile, threshold = 0.5) } \arguments{ \item{runs}{R object (dataframe) with results on detected runs} -\item{SnpInRuns}{dataframe with the proportion of times each SNP falls inside a -run in the population (output from \code{snpInsideRuns})} - \item{genotypeFile}{Plink ped file (for SNP position)} \item{mapFile}{Plink map file (for SNP position)} -\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired -proportion of individuals carrying that run (e.g. 70\%)} +\item{threshold}{value from 0 to 1 (default 0.5) that controls the desired +proportion of individuals carrying that run (e.g. 50\%)} } \value{ A dataframe with the most common runs detected in the sampled individuals @@ -32,10 +23,9 @@ included in the run and average percentage of SNPs in run are reported in the output dataframe) } \description{ -This function takes in input either the run results or the output from -the function \code{snpInsideRuns} (proportion of times a SNP is inside a run) -in the population/group, and returns a subset of the runs most commonly -found in the group/population. The parameter \code{threshold} controls the definition +This function takes in input either the run results and returns a subset of +the runs most commonly found in the group/population. The parameter +\code{threshold} controls the definition of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals) } \examples{ @@ -46,13 +36,17 @@ mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detec # calculating runs of Homozygosity \dontrun{ # skipping runs calculation -runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15, -ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000) +runs <- slidingRUNS.run(genotypeFile, mapFile, + windowSize = 15, threshold = 0.1, minSNP = 15, + ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000 +) } # loading pre-calculated data -runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS") -runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS') +runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS") +runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS") -tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5) +table <- tableRuns( + runs = runsDF, genotypeFile = genotypeFile, + mapFile = mapFile, threshold = 0.5) } diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp index e4721e6..d56ba46 100644 --- a/detectRUNS/src/RcppExports.cpp +++ b/detectRUNS/src/RcppExports.cpp @@ -147,15 +147,29 @@ BEGIN_RCPP END_RCPP } // snpInsideRunsCpp -DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile); -RcppExport SEXP _detectRUNS_snpInsideRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) { +DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, DataFrame pops); +RcppExport SEXP _detectRUNS_snpInsideRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP popsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP); Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP); + Rcpp::traits::input_parameter< DataFrame >::type pops(popsSEXP); + rcpp_result_gen = Rcpp::wrap(snpInsideRunsCpp(runsChrom, mapChrom, pops)); + return rcpp_result_gen; +END_RCPP +} +// tableRuns +DataFrame tableRuns(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold); +RcppExport SEXP _detectRUNS_tableRuns(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< DataFrame >::type runs(runsSEXP); Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP); - rcpp_result_gen = Rcpp::wrap(snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)); + Rcpp::traits::input_parameter< std::string >::type mapFile(mapFileSEXP); + Rcpp::traits::input_parameter< const float >::type threshold(thresholdSEXP); + rcpp_result_gen = Rcpp::wrap(tableRuns(runs, genotypeFile, mapFile, threshold)); return rcpp_result_gen; END_RCPP } @@ -172,6 +186,7 @@ static const R_CallMethodDef CallEntries[] = { {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1}, {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9}, {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3}, + {"_detectRUNS_tableRuns", (DL_FUNC) &_detectRUNS_tableRuns, 4}, {NULL, NULL, 0} }; diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp index 89b04ab..73005fa 100644 --- a/detectRUNS/src/functions.cpp +++ b/detectRUNS/src/functions.cpp @@ -908,7 +908,7 @@ void Runs::dumpRuns() { //' //' @param runsChrom R object (dataframe) with results per chromosome //' @param mapChrom R map object with SNP per chromosome -//' @param genotypeFile genotype (.ped) file location +//' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}} //' //' @return dataframe with counts per SNP in runs (per population) //' @@ -919,12 +919,12 @@ void Runs::dumpRuns() { //' // [[Rcpp::export]] DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, - std::string genotypeFile) { + DataFrame pops) { - // transform R object in Cpp object - std::vector POSITIONS = as >(mapChrom["POSITION"]); - std::vector SNP_NAME = as >(mapChrom["SNP_NAME"]); - std::vector CHR = as >(mapChrom["CHR"]); + // get columns as vectors + IntegerVector POSITIONS = mapChrom["POSITION"]; + CharacterVector SNP_NAME = mapChrom["SNP_NAME"]; + CharacterVector CHR = mapChrom["CHR"]; // get unique breeds CharacterVector population = runsChrom["POPULATION"]; @@ -948,9 +948,9 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, IntegerVector count(result_size); CharacterVector breed(result_size); NumericVector percentage(result_size); + IntegerVector number(result_size); // get all populations - DataFrame pops = readPOPCpp(genotypeFile); CharacterVector pop = pops["POP"]; // instantiate a Runs object @@ -988,6 +988,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, count[index] = snpCounts[ras]; breed[index] = ras; percentage[index] = double(snpCounts[ras])/nBreeds[ras]*100; + number[index] = j+1; // debug // if (SNP_NAME[j] == "OAR24_6970428.1") { @@ -1006,10 +1007,380 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, // initialize dataframe of results. DataFrame res = DataFrame::create( - Named("SNP_NAME")=snp_name, Named("CHR")=chrom, Named("POSITION")=position, - Named("COUNT")=count, Named("BREED")=fast_factor(breed) , //returns a factor - Named("PERCENTAGE")=percentage, _["stringsAsFactors"] = false); + Named("SNP_NAME") = snp_name, + Named("CHR") = chrom, + Named("POSITION") = position, + Named("COUNT") = count, + Named("BREED") = fast_factor(breed), //returns a factor + Named("PERCENTAGE") = percentage, + Named("Number") = number, + _["stringsAsFactors"] = false); // returning all runs for this individual genotype return(res); } + + +DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){ + CharacterVector population = runs[0]; + CharacterVector ind = runs[1]; + CharacterVector chromosomes = runs[2]; + IntegerVector count = runs[3]; + IntegerVector start = runs[4]; + IntegerVector end = runs[5]; + IntegerVector length = runs[6]; + + LogicalVector chrom_idx(chromosomes.size()); + + // Rprintf("Got %d runs\n", runs.nrows()); + + for (unsigned int j=0; j(chromosomes[j]) == chrom); + } + + DataFrame res = DataFrame::create( + Named("CHR") = chromosomes[chrom_idx], + Named("SNP_NAME") = snps[chrom_idx], + Named("POSITION") = positions[chrom_idx] + ); + + // Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str()); + return res; +} + + +DataFrame filter_snpInsideRuns_by_threshold( + DataFrame snpInsideRuns, float threshold){ + CharacterVector snps = snpInsideRuns["SNP_NAME"]; + CharacterVector chromosomes = snpInsideRuns["CHR"]; + IntegerVector positions = snpInsideRuns["POSITION"]; + IntegerVector counts = snpInsideRuns["COUNT"]; + CharacterVector breeds = snpInsideRuns["BREED"]; + NumericVector percentages = snpInsideRuns["PERCENTAGE"]; + IntegerVector numbers = snpInsideRuns["Number"]; + + LogicalVector idx(percentages.size()); + + // Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows()); + + for (unsigned int j=0; j= threshold); + } + + DataFrame res = DataFrame::create( + Named("SNP_NAME") = snps[idx], + Named("CHR") = chromosomes[idx], + Named("POSITION") = positions[idx], + Named("COUNT") = counts[idx], + Named("BREED") = fast_factor(breeds[idx]), //returns a factor + Named("PERCENTAGE") = percentages[idx], + Named("Number") = numbers[idx], + _["stringsAsFactors"] = false); + + // Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold); + return res; +} + + +DataFrame filter_snpInsideRuns_by_breed( + DataFrame snpInsideRuns, std::string breed){ + CharacterVector snps = snpInsideRuns["SNP_NAME"]; + CharacterVector chromosomes = snpInsideRuns["CHR"]; + IntegerVector positions = snpInsideRuns["POSITION"]; + IntegerVector counts = snpInsideRuns["COUNT"]; + CharacterVector breeds = snpInsideRuns["BREED"]; + NumericVector percentages = snpInsideRuns["PERCENTAGE"]; + IntegerVector numbers = snpInsideRuns["Number"]; + + LogicalVector idx(breeds.size()); + + for (unsigned int j=0; j 1 || threshold < 0) { + throw std::range_error("Threshold must be between 0 and 1"); + } + + // set a threshold + float threshold_used = threshold * 100; + Rprintf("Threshold used: %f\n", threshold_used); + + // Obtaining namespace of detectRUNS package + Environment pkg = Environment::namespace_env("detectRUNS"); + + // Picking up readMapFile function from detectRUNS package + Function readMapFile = pkg["readMapFile"]; + + // read mapfile using R function (which uses data.table::fread) + DataFrame mappa = readMapFile(mapFile); + + // read populations from genotypeFile + Rprintf("Read populations from '%s'...\n", genotypeFile.c_str()); + DataFrame pops = readPOPCpp(genotypeFile); + Rprintf("Done!\n"); + + // get distinct chromosomes + CharacterVector chromosomes = runs[2]; + CharacterVector unique_chromosomes = sort_unique(chromosomes); + + // These will be the output vectors + CharacterVector res_group; + CharacterVector res_start_snp; + CharacterVector res_end_snp; + CharacterVector res_chrom; + IntegerVector res_n_snp; + IntegerVector res_from; + IntegerVector res_to; + NumericVector res_avg_pct; + + Rcout << "Calculation % SNP in ROH\n"; + + for (unsigned int i=0; i(unique_chromosomes[i]); + + Rprintf( + "Processing chromosome '%s' (%d/%d)\n", + chrom.c_str(), + i+1, unique_chromosomes.size()); + + // extract the desired chrom + DataFrame runsChrom = subset_runs_by_chrom(runs, chrom); + + if (runsChrom.nrows() == 0) { + REprintf("No runs on chromosome %s\n", chrom.c_str()); + continue; + } + + DataFrame mapKrom = subset_map_by_chrom(mappa, chrom); + + if (mapKrom.nrows() == 0) { + REprintf("No SNPs data for chromosome %s\n", chrom.c_str()); + continue; + } + + // calculate snpInsideRuns + DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, pops); + + if (snpInsideRuns.nrows() == 0) { + REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str()); + continue; + } + + // now filter snpInsideRuns relying on threshold + snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used); + + if (snpInsideRuns.nrows() == 0) { + REprintf( + "No SNPs in runs after filtering %f for chromosome %s\n", + threshold_used, chrom.c_str()); + continue; + } + + // collect the distinct breeds + CharacterVector breeds = snpInsideRuns["BREED"]; + CharacterVector group_list = unique(breeds); + + for (unsigned int j=0; j(group_list[j]); + + // Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str()); + + DataFrame group_subset = filter_snpInsideRuns_by_breed(snpInsideRuns, grp); + + // after filtering, I need to have at least 2 rows or I can't do the following stuff + if (group_subset.nrows() < 2) { + continue; + } + + // initialize stuff + IntegerVector idxs = group_subset["Number"]; + IntegerVector positions = group_subset["POSITION"]; + CharacterVector snp_names = group_subset["SNP_NAME"]; + NumericVector sum_pcts = group_subset["PERCENTAGE"]; + + int old_idx = idxs[0]; + int from = positions[0]; + String Start_SNP = snp_names[0]; + int snp_count = 1; + double sum_pct = sum_pcts[0]; + + // Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from); + // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count); + // Rprintf("sum_pct: %f\n", sum_pct); + + for (int x=1; x(snp_names[x]).c_str()); + // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); + + // get current index + int new_idx = idxs[x]; + + // Difference between indexes. This will be 1 in case of adjacent rows + int diff = new_idx - old_idx; + + // declare other variables + String end_SNP; + int TO; + + if ((diff > 1) | (x == group_subset.nrows()-1)) { + if (x == group_subset.nrows()-1) { + // Rcout << "End of subset\n"; + end_SNP = snp_names[x]; + TO = positions[x]; + } else { + // Rcout << "End of segment\n"; + end_SNP = snp_names[x-1]; + TO = positions[x-1]; + } + + // throw away row if composed by only one SNP + if (snp_count > 1) { + res_group.push_back(grp); + res_start_snp.push_back(Start_SNP); + res_end_snp.push_back(end_SNP); + res_chrom.push_back(chrom); + res_n_snp.push_back(snp_count); + res_from.push_back(from); + res_to.push_back(TO); + res_avg_pct.push_back(sum_pct); + + // Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring()); + // Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str()); + // Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO); + // Rprintf("sum_pct: %d\n", sum_pct); + } + + // reset variable + // Rprintf("Start a new segment: %s, ", as(snp_names[x]).c_str()); + // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]); + + snp_count = 1; + sum_pct = sum_pcts[x]; + from = positions[x]; + Start_SNP = snp_names[x]; + } else { + // update stuff + snp_count++; + sum_pct += sum_pcts[x]; + + // Rprintf("Add %s to group ", as(snp_names[x]).c_str()); + // Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct); + } + + // update index + old_idx = new_idx; + } // cycle for snpInsideRuns + + } // cycle for group (breed) + + } // cycle for chromosome + + // calculate the avg percentage + for (unsigned int i=0; i