bioinformatics-ptp · filippob · Jan 5, 2022 · Jan 5, 2022 · Mar 22, 2022 · Mar 28, 2022
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -29,6 +29,5 @@ jobs:
           working-directory: ./detectRUNS
 
       - name: Test coverage
-        run: covr::codecov()
+        run: covr::codecov(path="detectRUNS/")
         shell: Rscript {0}
-        working-directory: ./detectRUNS
diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
@@ -2,11 +2,11 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9000
-Date: 2020-12-20
+Version: 0.9.6.9004
+Date: 2022-04-21
 Authors@R: c(
     person("Filippo","Biscarini", email="[email protected]", role=c("aut", "cre")),
-    person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"),
+    person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
     person("Giustino","Gaspa", email="[email protected]", role="aut"),
     person("Gabriele","Marras", email="[email protected]", role="aut")
   )
@@ -28,7 +28,7 @@ Imports:
     Rcpp,
     gridExtra,
     data.table
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
 Suggests:
     testthat,
     knitr,

diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
@@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
 #'
 #' @param runsChrom R object (dataframe) with results per chromosome
 #' @param mapChrom R map object with SNP per chromosome
-#' @param genotypeFile genotype (.ped) file location
+#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}}
 #'
 #' @return dataframe with counts per SNP in runs (per population)
 #'
@@ -192,7 +192,54 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
 #' @useDynLib detectRUNS
 #' @importFrom Rcpp sourceCpp
 #'
-snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
-    .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
+snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) {
+    .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops)
+}
+
+#' Function to retrieve most common runs in the population
+#'
+#' This function takes in input either the run results and returns a subset of
+#' the runs most commonly found in the group/population. The parameter
+#' \code{threshold} controls the definition
+#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
+#'
+#' @param genotypeFile Plink ped file (for SNP position)
+#' @param mapFile Plink map file (for SNP position)
+#' @param runs R object (dataframe) with results on detected runs
+#' @param threshold value from 0 to 1 (default 0.5) that controls the desired
+#' proportion of individuals carrying that run (e.g. 50\%)
+#'
+#' @return A dataframe with the most common runs detected in the sampled individuals
+#' (the group/population, start and end position of the run, chromosome, number of SNP
+#' included in the run and average percentage of SNPs in run
+#' are reported in the output dataframe)
+#' @export
+#'
+#' @examples
+#' # getting map and ped paths
+#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
+#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
+#'
+#' # calculating runs of Homozygosity
+#' \dontrun{
+#' # skipping runs calculation
+#' runs <- slidingRUNS.run(genotypeFile, mapFile,
+#'   windowSize = 15, threshold = 0.1, minSNP = 15,
+#'   ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
+#' )
+#' }
+#' # loading pre-calculated data
+#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
+#'
+#' table <- tableRuns(
+#'   runs = runsDF, genotypeFile = genotypeFile,
+#'   mapFile = mapFile, threshold = 0.5)
+#'
+#' @useDynLib detectRUNS
+#' @importFrom Rcpp sourceCpp
+#'
+tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
+    .Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
 }
 
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
@@ -17,7 +17,7 @@
 chromosomeLength <- function(mapFile){
   # read mapfile
   mappa <- readMapFile(mapFile)
-  
+
   maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0
 
   # defining NULL variables to avoid warning messages
@@ -243,7 +243,16 @@ Froh_inbreedingClass <- function(runs, mapFile, Class=2){
 #'
 #' @return A list of dataframes containing the most relevant descriptives
 #' statistics on detected runs. The list conveniently contains 9 dataframes that can
-#' be used for further processing and visualization, or can be written out to text files
+#' be used for further processing and visualization, or can be written out to text files:
+#' 1) summary_ROH_count_chr: n. of runs per chromosome and breed/group;
+#' 2) summary_ROH_percentage_chr: percent distribution of runs per chromosome in each breed/group (sum to 1)
+#' 3) summary_ROH_count: n. of runs per size-class (Mb) in each breed/group
+#' 4) summary_ROH_percentage: percent distribution of runs per size-class (Mb) in each breed/group (sum to 1)
+#' 5) summary_ROH_mean_chr: average size of runs (Mb) per chromosome and breed/group
+#' 6) summary_ROH_mean_class: average size of runs (Mb) per size-class (Mb) in each breed/group
+#' 7) result_Froh_genome_wide: genome-wide inbreeding ($F_{ROH}$) for each individual in the dataset
+#' 8) result_Froh_chromosome_wide inbreeding ($F_{ROH}$) for each individual (and chromosome) in the dataset
+#' 9) result_Froh_class: genome-wide inbreeding ($F_{ROH}$) for each individual in the dataset per size-class (Mb) of runs
 #' @export
 #'
 #' @examples
@@ -355,7 +364,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 
     message("Calculating SNPs inside ROH")
     names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
-    
+
     #read map file
     mappa <- readMapFile(mapFile)
 
@@ -378,177 +387,24 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
     for (chrom in sort(unique(runs$CHROMOSOME))) {
       runsChrom <- runs[runs$CHROMOSOME==chrom,]
       mapKrom <- mappa[mappa$CHR==chrom,]
-      snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
-      all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
-      n=n+1
-      setTxtProgressBar(pb, n)
-    }
-    close(pb)
-
-    result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
-    message("Calculation % SNP in ROH finish") #FILIPPO
-  }
-
-
-
-  return(result_summary)
-}
-
-
-#' Function to retrieve most common runs in the population
-#'
-#' This function takes in input either the run results or the output from
-#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run)
-#' in the population/group, and returns a subset of the runs most commonly
-#' found in the group/population. The parameter \code{threshold} controls the definition
-#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
-#'
-#' @param genotypeFile Plink ped file (for SNP position)
-#' @param mapFile Plink map file (for SNP position)
-#' @param runs R object (dataframe) with results on detected runs
-#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
-#' proportion of individuals carrying that run (e.g. 70\%)
-#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a
-#' run in the population (output from \code{snpInsideRuns})
-#'
-#' @return A dataframe with the most common runs detected in the sampled individuals
-#' (the group/population, start and end position of the run, chromosome and number of SNP
-#' included in the run are reported in the output dataframe)
-#' @export
-#'
-#' @examples
-#' # getting map and ped paths
-#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
-#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
-#'
-#' # calculating runs of Homozygosity
-#' \dontrun{
-#' # skipping runs calculation
-#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1,  minSNP = 15,
-#' ROHet = FALSE,  maxOppositeGenotype = 1, maxMiss = 1,  minLengthBps = 100000,  minDensity = 1/10000)
-#' }
-#' # loading pre-calculated data
-#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
-#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
-#'
-#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
-#'
-
-tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) {
-
-  #set a threshold
-  threshold_used=threshold*100
-  message(paste('Threshold used:',threshold_used))
 
-  # read map file
-  mappa <- readMapFile(mapFile)
-
-  if(!is.null(runs) & is.null(SnpInRuns)){
-    message('I found only Runs data frame. GOOD!')
+      pops <- readPOPCpp(genotypeFile)
+      snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops)
 
-    #change colnames in runs file
-    names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
+      # remove Number column
+      snpInRuns$Number <- NULL
 
-    #Start calculation % SNP in ROH
-    message("Calculation % SNP in ROH") #FILIPPO
-    all_SNPinROH <- data.frame("SNP_NAME"=character(),
-                               "CHR"=integer(),
-                               "POSITION"=numeric(),
-                               "COUNT"=integer(),
-                               "BREED"=factor(),
-                               "PERCENTAGE"=numeric(),
-                               stringsAsFactors=FALSE)
-
-    # create progress bar
-    total <- length(unique(runs$CHROMOSOME))
-    message(paste('Chromosome founds: ',total)) #FILIPPO
-    n=0
-    pb <- txtProgressBar(min = 0, max = total, style = 3)
-
-    #SNP in ROH
-    for (chrom in sort(unique(runs$CHROMOSOME))) {
-      runsChrom <- runs[runs$CHROMOSOME==chrom,]
-      mapKrom <- mappa[mappa$CHR==chrom,]
-      snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
       all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
       n=n+1
       setTxtProgressBar(pb, n)
     }
     close(pb)
+
+    result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
     message("Calculation % SNP in ROH finish") #FILIPPO
-  } else if (is.null(runs) & !is.null(SnpInRuns)) {
-    message('I found only SNPinRuns data frame. GOOD!')
-    all_SNPinROH=SnpInRuns
-  } else{
-    stop('You gave me Runs and SNPinRuns! Please choose one!')
   }
 
-  #consecutive number
-  all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE))
-
-  #final data frame
-  final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0),
-                            "chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0))
-
-
-  #vector of breeds
-  group_list=as.vector(unique(all_SNPinROH$BREED))
-
-  for (grp in group_list){
-    message(paste('checking: ',grp))
-
-    #create subset for group/thresold
-    group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,])
-
-    #print(group_subset)
-
-    #variable
-    old_pos=group_subset[1,7]
-    snp_pos1=group_subset[1,3]
-    Start_SNP=group_subset[1,1]
-    snp_count=0
-
-    x=2
-    while(x <= length(rownames(group_subset))) {
 
-      snp_count = snp_count + 1
-      new_pos=group_subset[x,7]
-      old_pos=group_subset[x-1,7]
-      chr_old=group_subset[x-1,2]
-      chr_new =group_subset[x,2]
 
-      diff=new_pos-old_pos
-
-      if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) {
-        if (x==length(rownames(group_subset))){
-          end_SNP=group_subset[x,1]
-          TO=group_subset[x,3]
-        }else{
-          end_SNP=group_subset[x-1,1]
-          TO=group_subset[x-1,3]
-        }
-
-        final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5],
-                                                                           "Start_SNP"=Start_SNP,
-                                                                           "End_SNP"=end_SNP,
-                                                                           "chrom"=group_subset[x-1,2],
-                                                                           "nSNP"=snp_count,
-                                                                           "from"=snp_pos1,
-                                                                           "to"=TO))
-
-        #reset variable
-        snp_count=0
-        snp_pos1=group_subset[x,3]
-        Start_SNP=group_subset[x,1]
-      }
-
-      #upgrade x value
-      x <- x+1
-
-    }
-  }
-
-  rownames(final_table) <- seq(1,length(row.names(final_table)))
-  return(final_table)
+  return(result_summary)
 }
-