From 7c6bd2d10f36e284c6cfecb53a08cbe4cadf42f5 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 28 Mar 2022 19:06:35 +0200
Subject: [PATCH 01/23] :white_check_mark: test tableRuns

---
 detectRUNS/DESCRIPTION                       |  6 +++---
 detectRUNS/man/tableRuns.Rd                  |  5 +++--
 detectRUNS/src/RcppExports.cpp               |  5 +++++
 detectRUNS/tests/testthat/test.tableRuns.csv |  4 ++++
 detectRUNS/tests/testthat/test_Stats.R       | 21 ++++++++++++++++++++
 5 files changed, 36 insertions(+), 5 deletions(-)
 create mode 100644 detectRUNS/tests/testthat/test.tableRuns.csv
 create mode 100644 detectRUNS/tests/testthat/test_Stats.R
diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
index a4e7595..56220c4 100644
--- a/detectRUNS/DESCRIPTION
+++ b/detectRUNS/DESCRIPTION
@@ -2,11 +2,11 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9000
+Version: 0.9.6.9001
 Date: 2020-12-20
 Authors@R: c(
     person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")),
-    person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"),
+    person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
     person("Giustino","Gaspa", email="gigaspa@uniss.it", role="aut"),
     person("Gabriele","Marras", email="gmarras@uoguelph.ca", role="aut")
   )
@@ -28,7 +28,7 @@ Imports:
     Rcpp,
     gridExtra,
     data.table
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 Suggests:
     testthat,
     knitr,
diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd
index 83c2da8..1e2bf7d 100644
--- a/detectRUNS/man/tableRuns.Rd
+++ b/detectRUNS/man/tableRuns.Rd
@@ -27,8 +27,9 @@ proportion of individuals carrying that run (e.g. 70\%)}
 }
 \value{
 A dataframe with the most common runs detected in the sampled individuals
-(the group/population, start and end position of the run, chromosome and number of SNP
-included in the run are reported in the output dataframe)
+(the group/population, start and end position of the run, chromosome, number of SNP
+included in the run and average percentage of SNPs in run
+are reported in the output dataframe)
 }
 \description{
 This function takes in input either the run results or the output from
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index c2b5579..e4721e6 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -5,6 +5,11 @@
 
 using namespace Rcpp;
 
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
 // fast_factor
 SEXP fast_factor(SEXP x);
 RcppExport SEXP _detectRUNS_fast_factor(SEXP xSEXP) {
diff --git a/detectRUNS/tests/testthat/test.tableRuns.csv b/detectRUNS/tests/testthat/test.tableRuns.csv
new file mode 100644
index 0000000..fa6d084
--- /dev/null
+++ b/detectRUNS/tests/testthat/test.tableRuns.csv
@@ -0,0 +1,4 @@
+"Group";"Start_SNP";"End_SNP";"chrom";"nSNP";"from";"to";"avg_pct"
+"Jacobs";"s07830.1";"s20481.1";"24";9;16041579;16811910;60
+"Jacobs";"s15456.1";"s30694.1";"24";12;31774063;32349389;60
+"Jacobs";"s59571.1";"s13600.1";"24";6;33056732;33397677;60
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
new file mode 100644
index 0000000..7a9dcc0
--- /dev/null
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -0,0 +1,21 @@
+library(testthat)
+library(detectRUNS)
+context("Testing Stats")
+
+# get file paths
+genotypeFile  <- "test.ped"
+mapFile <- "test.map"
+rawFile <- "test.raw"
+runsFile <- "test.ROHet.consecutive.csv"
+
+test_that("Test tableRuns", {
+  colClasses <- c("factor", "character", "character", "character", "numeric",
+                  "integer", "integer", "numeric")
+  reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
+  levels(reference$Group) <- c("Jacobs", "Navajo-Churro")
+  
+  runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
+  test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
+  
+  expect_equal(reference, test)
+})

From 0362a333cfa42f7dcb058cafbbf605bad6bd320b Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 28 Mar 2022 19:21:44 +0200
Subject: [PATCH 02/23] :fire: remove SnpInRuns from tableRuns parameters

snpInsideRuns(Cpp) is now internal
---
 detectRUNS/R/Stats.R                | 79 +++++++++++++----------------
 detectRUNS/man/tableRuns.Rd         | 18 ++-----
 performance/test-snpInsideRunsCpp.R | 26 ++++++++++
 3 files changed, 64 insertions(+), 59 deletions(-)
 create mode 100644 performance/test-snpInsideRunsCpp.R

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index c1d7cf3..f076675 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -397,10 +397,9 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 
 #' Function to retrieve most common runs in the population
 #'
-#' This function takes in input either the run results or the output from
-#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run)
-#' in the population/group, and returns a subset of the runs most commonly
-#' found in the group/population. The parameter \code{threshold} controls the definition
+#' This function takes in input either the run results and returns a subset of 
+#' the runs most commonly found in the group/population. The parameter 
+#' \code{threshold} controls the definition
 #' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
 #'
 #' @param genotypeFile Plink ped file (for SNP position)
@@ -408,8 +407,6 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 #' @param runs R object (dataframe) with results on detected runs
 #' @param threshold value from 0 to 1 (default 0.7) that controls the desired
 #' proportion of individuals carrying that run (e.g. 70\%)
-#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a
-#' run in the population (output from \code{snpInsideRuns})
 #'
 #' @return A dataframe with the most common runs detected in the sampled individuals
 #' (the group/population, start and end position of the run, chromosome, number of SNP
@@ -435,7 +432,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 #' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 #'
 
-tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) {
+tableRuns <- function(runs=NULL, genotypeFile, mapFile, threshold = 0.5) {
 
   #set a threshold
   threshold_used=threshold*100
@@ -444,45 +441,37 @@ tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold
   # read map file
   mappa <- readMapFile(mapFile)
 
-  if(!is.null(runs) & is.null(SnpInRuns)){
-    message('I found only Runs data frame. GOOD!')
-
-    #change colnames in runs file
-    names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
-
-    #Start calculation % SNP in ROH
-    message("Calculation % SNP in ROH") #FILIPPO
-    all_SNPinROH <- data.frame("SNP_NAME"=character(),
-                               "CHR"=integer(),
-                               "POSITION"=numeric(),
-                               "COUNT"=integer(),
-                               "BREED"=factor(),
-                               "PERCENTAGE"=numeric(),
-                               stringsAsFactors=FALSE)
-
-    # create progress bar
-    total <- length(unique(runs$CHROMOSOME))
-    message(paste('Chromosome founds: ',total)) #FILIPPO
-    n=0
-    pb <- txtProgressBar(min = 0, max = total, style = 3)
-
-    #SNP in ROH
-    for (chrom in sort(unique(runs$CHROMOSOME))) {
-      runsChrom <- runs[runs$CHROMOSOME==chrom,]
-      mapKrom <- mappa[mappa$CHR==chrom,]
-      snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
-      all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
-      n=n+1
-      setTxtProgressBar(pb, n)
-    }
-    close(pb)
-    message("Calculation % SNP in ROH finish") #FILIPPO
-  } else if (is.null(runs) & !is.null(SnpInRuns)) {
-    message('I found only SNPinRuns data frame. GOOD!')
-    all_SNPinROH=SnpInRuns
-  } else{
-    stop('You gave me Runs and SNPinRuns! Please choose one!')
+  #change colnames in runs file
+  names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
+
+  #Start calculation % SNP in ROH
+  message("Calculation % SNP in ROH") #FILIPPO
+  all_SNPinROH <- data.frame("SNP_NAME"=character(),
+                             "CHR"=integer(),
+                             "POSITION"=numeric(),
+                             "COUNT"=integer(),
+                             "BREED"=factor(),
+                             "PERCENTAGE"=numeric(),
+                             stringsAsFactors=FALSE)
+
+  # create progress bar
+  total <- length(unique(runs$CHROMOSOME))
+  message(paste('Chromosome founds: ',total)) #FILIPPO
+  n=0
+  pb <- txtProgressBar(min = 0, max = total, style = 3)
+
+  #SNP in ROH
+  for (chrom in sort(unique(runs$CHROMOSOME))) {
+    runsChrom <- runs[runs$CHROMOSOME==chrom,]
+    mapKrom <- mappa[mappa$CHR==chrom,]
+    snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
+    all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
+    n=n+1
+    setTxtProgressBar(pb, n)
   }
+  close(pb)
+  
+  message("Calculation % SNP in ROH finish") #FILIPPO
 
   #consecutive number
   all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE))
diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd
index 1e2bf7d..28dfec8 100644
--- a/detectRUNS/man/tableRuns.Rd
+++ b/detectRUNS/man/tableRuns.Rd
@@ -4,20 +4,11 @@
 \alias{tableRuns}
 \title{Function to retrieve most common runs in the population}
 \usage{
-tableRuns(
-  runs = NULL,
-  SnpInRuns = NULL,
-  genotypeFile,
-  mapFile,
-  threshold = 0.5
-)
+tableRuns(runs = NULL, genotypeFile, mapFile, threshold = 0.5)
 }
 \arguments{
 \item{runs}{R object (dataframe) with results on detected runs}
 
-\item{SnpInRuns}{dataframe with the proportion of times each SNP falls inside a
-run in the population (output from \code{snpInsideRuns})}
-
 \item{genotypeFile}{Plink ped file (for SNP position)}
 
 \item{mapFile}{Plink map file (for SNP position)}
@@ -32,10 +23,9 @@ included in the run and average percentage of SNPs in run
 are reported in the output dataframe)
 }
 \description{
-This function takes in input either the run results or the output from
-the function \code{snpInsideRuns} (proportion of times a SNP is inside a run)
-in the population/group, and returns a subset of the runs most commonly
-found in the group/population. The parameter \code{threshold} controls the definition
+This function takes in input either the run results and returns a subset of 
+the runs most commonly found in the group/population. The parameter 
+\code{threshold} controls the definition
 of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
 }
 \examples{
diff --git a/performance/test-snpInsideRunsCpp.R b/performance/test-snpInsideRunsCpp.R
new file mode 100644
index 0000000..a59ec19
--- /dev/null
+++ b/performance/test-snpInsideRunsCpp.R
@@ -0,0 +1,26 @@
+
+# get files path
+mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package="detectRUNS")
+genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS")
+runsfile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
+
+# read mapfile with custom methods
+mappa <- readMapFile(mapFile)
+
+# this is the chromosome I want to test
+chrom <- "24"
+
+# subsetting mapChrom (get x random snps from mapfile)
+set.seed(42)
+mapChrom <- mappa[mappa$CHR==chrom, ]
+mapChrom <- mapChrom[sort(sample(nrow(mapChrom), 10)), ]
+
+# loading pre-calculated data
+runs <- readExternalRuns(runsfile, program="detectRUNS")
+names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
+runsChrom <- runs[runs$CHROMOSOME==chrom, ]
+
+# get snps inside runs
+for (i in 1:100000) {
+  test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)  
+}

From 9c74576f46ac05fad13e786ad419b36c8c547711 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 28 Mar 2022 20:06:30 +0200
Subject: [PATCH 03/23] :zap: calculate tableRuns by chrom

---
 detectRUNS/R/Stats.R        | 193 ++++++++++++++++++------------------
 detectRUNS/man/tableRuns.Rd |  14 +--
 2 files changed, 103 insertions(+), 104 deletions(-)

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index f076675..7c82365 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -397,8 +397,8 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 
 #' Function to retrieve most common runs in the population
 #'
-#' This function takes in input either the run results and returns a subset of 
-#' the runs most commonly found in the group/population. The parameter 
+#' This function takes in input either the run results and returns a subset of
+#' the runs most commonly found in the group/population. The parameter
 #' \code{threshold} controls the definition
 #' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
 #'
@@ -422,129 +422,126 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 #' # calculating runs of Homozygosity
 #' \dontrun{
 #' # skipping runs calculation
-#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1,  minSNP = 15,
-#' ROHet = FALSE,  maxOppositeGenotype = 1, maxMiss = 1,  minLengthBps = 100000,  minDensity = 1/10000)
+#' runs <- slidingRUNS.run(genotypeFile, mapFile,
+#'   windowSize = 15, threshold = 0.1, minSNP = 15,
+#'   ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
+#' )
 #' }
 #' # loading pre-calculated data
-#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
-#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
+#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
 #'
 #' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 #'
+tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
 
-tableRuns <- function(runs=NULL, genotypeFile, mapFile, threshold = 0.5) {
-
-  #set a threshold
-  threshold_used=threshold*100
-  message(paste('Threshold used:',threshold_used))
+  # set a threshold
+  threshold_used <- threshold * 100
+  message(paste("Threshold used:", threshold_used))
 
   # read map file
   mappa <- readMapFile(mapFile)
 
-  #change colnames in runs file
-  names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
+  # change colnames in runs file
+  names(runs) <- c("POPULATION", "IND", "CHROMOSOME", "COUNT", "START", "END", "LENGTH")
 
-  #Start calculation % SNP in ROH
-  message("Calculation % SNP in ROH") #FILIPPO
-  all_SNPinROH <- data.frame("SNP_NAME"=character(),
-                             "CHR"=integer(),
-                             "POSITION"=numeric(),
-                             "COUNT"=integer(),
-                             "BREED"=factor(),
-                             "PERCENTAGE"=numeric(),
-                             stringsAsFactors=FALSE)
+  # vector of breeds
+  group_list <- as.vector(unique(runs$POPULATION))
+
+  # final data frame
+  final_table <- data.frame(
+    "GROUP" = character(0), "Start_SNP" = character(0), "End_SNP" = character(0),
+    "chrom" = character(0), "nSNP" = integer(0), "from" = integer(0), "to" = integer(0), "avg_pct" = numeric(0)
+  )
+
+  # Start calculation % SNP in ROH
+  message("Calculation % SNP in ROH") # FILIPPO
 
   # create progress bar
   total <- length(unique(runs$CHROMOSOME))
-  message(paste('Chromosome founds: ',total)) #FILIPPO
-  n=0
+  message(paste("Chromosome founds: ", total)) # FILIPPO
+  n <- 0
   pb <- txtProgressBar(min = 0, max = total, style = 3)
 
-  #SNP in ROH
+  # SNP in ROH
   for (chrom in sort(unique(runs$CHROMOSOME))) {
-    runsChrom <- runs[runs$CHROMOSOME==chrom,]
-    mapKrom <- mappa[mappa$CHR==chrom,]
-    snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
-    all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
-    n=n+1
-    setTxtProgressBar(pb, n)
-  }
-  close(pb)
-  
-  message("Calculation % SNP in ROH finish") #FILIPPO
-
-  #consecutive number
-  all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE))
-
-  #final data frame
-  final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0),
-                            "chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0), "avg_pct"=numeric(0))
-
-
-  #vector of breeds
-  group_list=as.vector(unique(all_SNPinROH$BREED))
+    # extract from map and runs only the chromosome I need
+    runsChrom <- runs[runs$CHROMOSOME == chrom, ]
+    mapKrom <- mappa[mappa$CHR == chrom, ]
 
-  for (grp in group_list){
-    message(paste('checking: ',grp))
+    # calculate snpInsideRuns
+    snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile)
 
-    #create subset for group/thresold
-    group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,])
+    # add a column with the row names as integer
+    snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns))
 
-    #print(group_subset)
+    for (grp in group_list) {
+      # create subset for group/thresold
+      group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE > threshold_used, ]
 
-    #variable
-    old_pos=group_subset[1,7]
-    snp_pos1=group_subset[1,3]
-    Start_SNP=group_subset[1,1]
-    snp_count=0
-    sum_pct = 0
-
-    x=2
-    while(x <= length(rownames(group_subset))) {
-
-      snp_count = snp_count + 1
-      new_pos=group_subset[x,7]
-      old_pos=group_subset[x-1,7]
-      chr_old=group_subset[x-1,2]
-      chr_new =group_subset[x,2]
-      sum_pct = sum_pct + group_subset[x-1,"PERCENTAGE"]
-
-      diff=new_pos-old_pos
+      # after filtering, I need to have at least 2 rows or I can't do the following stuff
+      if (nrow(group_subset) <= 2) {
+        break
+      }
 
-      if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) {
-        if (x==length(rownames(group_subset))){
-          end_SNP=group_subset[x,1]
-          TO=group_subset[x,3]
-        }else{
-          end_SNP=group_subset[x-1,1]
-          TO=group_subset[x-1,3]
+      # initialize stuff
+      old_pos <- group_subset[1, 7]
+      snp_pos1 <- group_subset[1, 3]
+      Start_SNP <- group_subset[1, 1]
+      snp_count <- 0
+      sum_pct <- 0
+
+      for (x in 2:nrow(group_subset)) {
+        snp_count <- snp_count + 1
+        new_pos <- group_subset[x, 7]
+        old_pos <- group_subset[x - 1, 7]
+        chr_old <- group_subset[x - 1, 2]
+        chr_new <- group_subset[x, 2]
+        sum_pct <- sum_pct + group_subset[x - 1, "PERCENTAGE"]
+
+        # this will be 1 in case of adjacent rows
+        diff <- new_pos - old_pos
+
+        if ((diff > 1) | (chr_new != chr_old) | x == nrow(group_subset)) {
+          if (x == nrow(group_subset)) {
+            end_SNP <- group_subset[x, 1]
+            TO <- group_subset[x, 3]
+          } else {
+            end_SNP <- group_subset[x - 1, 1]
+            TO <- group_subset[x - 1, 3]
+          }
+
+          final_table <- rbind.data.frame(final_table, final_table = data.frame(
+            "Group" = group_subset[x - 1, 5],
+            "Start_SNP" = Start_SNP,
+            "End_SNP" = end_SNP,
+            "chrom" = group_subset[x - 1, 2],
+            "nSNP" = snp_count,
+            "from" = snp_pos1,
+            "to" = TO,
+            "avg_pct" = sum_pct
+          ))
+
+          # reset variable
+          snp_count <- 0
+          sum_pct <- 0
+          snp_pos1 <- group_subset[x, 3]
+          Start_SNP <- group_subset[x, 1]
         }
-
-        final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5],
-                                                                           "Start_SNP"=Start_SNP,
-                                                                           "End_SNP"=end_SNP,
-                                                                           "chrom"=group_subset[x-1,2],
-                                                                           "nSNP"=snp_count,
-                                                                           "from"=snp_pos1,
-                                                                           "to"=TO,
-                                                                           "avg_pct"=sum_pct))
-
-        #reset variable
-        snp_count=0
-        sum_pct=0
-        snp_pos1=group_subset[x,3]
-        Start_SNP=group_subset[x,1]
       }
-
-      #upgrade x value
-      x <- x+1
-
     }
+
+    n <- n + 1
+    setTxtProgressBar(pb, n)
   }
 
-  final_table$avg_pct = final_table$avg_pct/final_table$nSNP
+  close(pb)
+
+  message("Calculation % SNP in ROH finish") # FILIPPO
+
+  final_table$avg_pct <- final_table$avg_pct / final_table$nSNP
+
+  row.names(final_table) <- seq(1, nrow(final_table))
 
-  rownames(final_table) <- seq(1,length(row.names(final_table)))
   return(final_table)
 }
-
diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd
index 28dfec8..39e34e1 100644
--- a/detectRUNS/man/tableRuns.Rd
+++ b/detectRUNS/man/tableRuns.Rd
@@ -23,8 +23,8 @@ included in the run and average percentage of SNPs in run
 are reported in the output dataframe)
 }
 \description{
-This function takes in input either the run results and returns a subset of 
-the runs most commonly found in the group/population. The parameter 
+This function takes in input either the run results and returns a subset of
+the runs most commonly found in the group/population. The parameter
 \code{threshold} controls the definition
 of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
 }
@@ -36,12 +36,14 @@ mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detec
 # calculating runs of Homozygosity
 \dontrun{
 # skipping runs calculation
-runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1,  minSNP = 15,
-ROHet = FALSE,  maxOppositeGenotype = 1, maxMiss = 1,  minLengthBps = 100000,  minDensity = 1/10000)
+runs <- slidingRUNS.run(genotypeFile, mapFile,
+  windowSize = 15, threshold = 0.1, minSNP = 15,
+  ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
+)
 }
 # loading pre-calculated data
-runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
-runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
+runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
 
 tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 

From 99cccaae24d72243ae181cf7c8b7ddb6403f61be Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 29 Mar 2022 11:06:57 +0200
Subject: [PATCH 04/23] :construction_worker: attempt to fix coverage
 visualization

---
 .github/workflows/test-coverage.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index 4727d86..8b0479d 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -29,6 +29,5 @@ jobs:
           working-directory: ./detectRUNS
 
       - name: Test coverage
-        run: covr::codecov()
+        run: covr::codecov(path="detectRUNS/")
         shell: Rscript {0}
-        working-directory: ./detectRUNS

From 0a7a24b3031c255287f5edf9254d95770413d227 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 29 Mar 2022 11:43:07 +0200
Subject: [PATCH 05/23] :bug: change tableRUNS threshold

check runs where SNP are major or equal the threshold specified
---
 detectRUNS/R/Stats.R                         | 10 ++++------
 detectRUNS/tests/testthat/test.tableRuns.csv |  7 ++++---
 detectRUNS/tests/testthat/test_Stats.R       |  4 ++--
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 7c82365..70bfcc1 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -17,7 +17,7 @@
 chromosomeLength <- function(mapFile){
   # read mapfile
   mappa <- readMapFile(mapFile)
-  
+
   maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0
 
   # defining NULL variables to avoid warning messages
@@ -355,7 +355,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 
     message("Calculating SNPs inside ROH")
     names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
-    
+
     #read map file
     mappa <- readMapFile(mapFile)
 
@@ -477,7 +477,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
 
     for (grp in group_list) {
       # create subset for group/thresold
-      group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE > threshold_used, ]
+      group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE >= threshold_used, ]
 
       # after filtering, I need to have at least 2 rows or I can't do the following stuff
       if (nrow(group_subset) <= 2) {
@@ -495,14 +495,12 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
         snp_count <- snp_count + 1
         new_pos <- group_subset[x, 7]
         old_pos <- group_subset[x - 1, 7]
-        chr_old <- group_subset[x - 1, 2]
-        chr_new <- group_subset[x, 2]
         sum_pct <- sum_pct + group_subset[x - 1, "PERCENTAGE"]
 
         # this will be 1 in case of adjacent rows
         diff <- new_pos - old_pos
 
-        if ((diff > 1) | (chr_new != chr_old) | x == nrow(group_subset)) {
+        if ((diff > 1) | x == nrow(group_subset)) {
           if (x == nrow(group_subset)) {
             end_SNP <- group_subset[x, 1]
             TO <- group_subset[x, 3]
diff --git a/detectRUNS/tests/testthat/test.tableRuns.csv b/detectRUNS/tests/testthat/test.tableRuns.csv
index fa6d084..ed2d731 100644
--- a/detectRUNS/tests/testthat/test.tableRuns.csv
+++ b/detectRUNS/tests/testthat/test.tableRuns.csv
@@ -1,4 +1,5 @@
 "Group";"Start_SNP";"End_SNP";"chrom";"nSNP";"from";"to";"avg_pct"
-"Jacobs";"s07830.1";"s20481.1";"24";9;16041579;16811910;60
-"Jacobs";"s15456.1";"s30694.1";"24";12;31774063;32349389;60
-"Jacobs";"s59571.1";"s13600.1";"24";6;33056732;33397677;60
+"Jacobs";"s07830.1";"s46918.1";"24";17;16041579;17307132;55,2941176470588
+"Jacobs";"s52452.1";"OAR24_35715684.1";"24";19;31556086;32598869;56,3157894736842
+"Jacobs";"s37249.1";"OAR24_36594681.1";"24";15;32681863;33503397;54,6666666666667
+"Jacobs";"s62096.1";"OAR24_37414016.1";"24";8;33798780;34225118;50
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index 7a9dcc0..eb47a79 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -13,9 +13,9 @@ test_that("Test tableRuns", {
                   "integer", "integer", "numeric")
   reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
   levels(reference$Group) <- c("Jacobs", "Navajo-Churro")
-  
+
   runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
   test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
-  
+
   expect_equal(reference, test)
 })

From a6e37d45ceb59eef77abf34dcc54360af8e3ec22 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Wed, 30 Mar 2022 14:45:22 +0200
Subject: [PATCH 06/23] :bug: fix bug in tableRuns

rewrite code to calculate percentages correctly
---
 detectRUNS/R/Stats.R | 65 ++++++++++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 23 deletions(-)

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 70bfcc1..ab539c3 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -484,48 +484,67 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
         break
       }
 
+      # message("Start from: ", paste0(group_subset[1, ], sep = " "))
+
       # initialize stuff
-      old_pos <- group_subset[1, 7]
-      snp_pos1 <- group_subset[1, 3]
+      old_idx <- group_subset[1, 7]
+      from <- group_subset[1, 3]
       Start_SNP <- group_subset[1, 1]
-      snp_count <- 0
-      sum_pct <- 0
+      snp_count <- 1
+      sum_pct <- group_subset[1, "PERCENTAGE"]
 
       for (x in 2:nrow(group_subset)) {
-        snp_count <- snp_count + 1
-        new_pos <- group_subset[x, 7]
-        old_pos <- group_subset[x - 1, 7]
-        sum_pct <- sum_pct + group_subset[x - 1, "PERCENTAGE"]
+        # message("Processing: ", paste0(group_subset[x, ], sep = " "))
+
+        # get current index
+        new_idx <- group_subset[x, 7]
 
-        # this will be 1 in case of adjacent rows
-        diff <- new_pos - old_pos
+        # Difference between indexes. This will be 1 in case of adjacent rows
+        diff <- new_idx - old_idx
 
         if ((diff > 1) | x == nrow(group_subset)) {
           if (x == nrow(group_subset)) {
+            # message("End of subset")
             end_SNP <- group_subset[x, 1]
             TO <- group_subset[x, 3]
           } else {
+            # message("End of segment")
             end_SNP <- group_subset[x - 1, 1]
             TO <- group_subset[x - 1, 3]
           }
 
-          final_table <- rbind.data.frame(final_table, final_table = data.frame(
-            "Group" = group_subset[x - 1, 5],
-            "Start_SNP" = Start_SNP,
-            "End_SNP" = end_SNP,
-            "chrom" = group_subset[x - 1, 2],
-            "nSNP" = snp_count,
-            "from" = snp_pos1,
-            "to" = TO,
-            "avg_pct" = sum_pct
-          ))
+          # throw away row if composed by only one SNP
+          if (snp_count > 1) {
+            final_table <- rbind.data.frame(final_table, final_table = data.frame(
+              "Group" = group_subset[x - 1, 5],
+              "Start_SNP" = Start_SNP,
+              "End_SNP" = end_SNP,
+              "chrom" = group_subset[x - 1, 2],
+              "nSNP" = snp_count,
+              "from" = from,
+              "to" = TO,
+              "avg_pct" = sum_pct
+            ))
+
+            # message("Writing: ", paste0(tail(final_table, n = 1), sep = " "))
+            # message("Start a new segment: ", paste0(group_subset[x, ], sep = " "))
+          }
 
           # reset variable
-          snp_count <- 0
-          sum_pct <- 0
-          snp_pos1 <- group_subset[x, 3]
+          snp_count <- 1
+          sum_pct <- group_subset[x, "PERCENTAGE"]
+          from <- group_subset[x, 3]
           Start_SNP <- group_subset[x, 1]
+        } else {
+          # update stuff
+          snp_count <- snp_count + 1
+          sum_pct <- sum_pct + group_subset[x, "PERCENTAGE"]
+
+          # message("Add snp to group: ", paste(snp_count, new_idx, old_idx, sum_pct))
         }
+
+        # update index
+        old_idx <- new_idx
       }
     }
 

From 437619ed3d0cfdbafb0af8a80b6160138ea70ded Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Wed, 30 Mar 2022 22:31:09 +0200
Subject: [PATCH 07/23] :bug: fix bug in tableRuns filtering

correctly skip empty subset
---
 detectRUNS/R/Stats.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index ab539c3..8ae2235 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -481,7 +481,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
 
       # after filtering, I need to have at least 2 rows or I can't do the following stuff
       if (nrow(group_subset) <= 2) {
-        break
+        next
       }
 
       # message("Start from: ", paste0(group_subset[1, ], sep = " "))

From ca3d31e36a0bf04adab06242bc6f74544d6a4bfb Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 4 Apr 2022 12:04:19 +0200
Subject: [PATCH 08/23] :sparkles: start implementing tableRuns with Rcpp

---
 detectRUNS/DESCRIPTION         |   4 +-
 detectRUNS/NAMESPACE           |   1 +
 detectRUNS/R/RcppExports.R     |  47 ++++++++++
 detectRUNS/man/tableRunsCpp.Rd |  52 +++++++++++
 detectRUNS/src/RcppExports.cpp |  15 +++
 detectRUNS/src/functions.cpp   | 166 +++++++++++++++++++++++++++++++++
 6 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 detectRUNS/man/tableRunsCpp.Rd

diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
index 56220c4..e4c980d 100644
--- a/detectRUNS/DESCRIPTION
+++ b/detectRUNS/DESCRIPTION
@@ -2,8 +2,8 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9001
-Date: 2020-12-20
+Version: 0.9.6.9002
+Date: 2022-04-04
 Authors@R: c(
     person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")),
     person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
diff --git a/detectRUNS/NAMESPACE b/detectRUNS/NAMESPACE
index 183f836..11039e4 100644
--- a/detectRUNS/NAMESPACE
+++ b/detectRUNS/NAMESPACE
@@ -16,6 +16,7 @@ export(reorderDF)
 export(slidingRUNS.run)
 export(summaryRuns)
 export(tableRuns)
+export(tableRunsCpp)
 import(ggplot2)
 import(itertools)
 import(plyr)
diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index b67a235..946d400 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -196,3 +196,50 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
     .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
 }
 
+#' Function to retrieve most common runs in the population
+#'
+#' This function takes in input either the run results and returns a subset of
+#' the runs most commonly found in the group/population. The parameter
+#' \code{threshold} controls the definition
+#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
+#'
+#' @param genotypeFile Plink ped file (for SNP position)
+#' @param mapFile Plink map file (for SNP position)
+#' @param runs R object (dataframe) with results on detected runs
+#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
+#' proportion of individuals carrying that run (e.g. 70\%)
+#'
+#' @return A dataframe with the most common runs detected in the sampled individuals
+#' (the group/population, start and end position of the run, chromosome, number of SNP
+#' included in the run and average percentage of SNPs in run
+#' are reported in the output dataframe)
+#' @export
+#'
+#' @examples
+#' # getting map and ped paths
+#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
+#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
+#'
+#' # calculating runs of Homozygosity
+#' \dontrun{
+#' # skipping runs calculation
+#' runs <- slidingRUNS.run(genotypeFile, mapFile,
+#'   windowSize = 15, threshold = 0.1, minSNP = 15,
+#'   ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
+#' )
+#' }
+#' # loading pre-calculated data
+#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
+#'
+#' table <- tableRunsCpp(
+#'   runs = runsDF, genotypeFile = genotypeFile,
+#'   mapFile = mapFile, threshold = 0.5)
+#'
+#' @useDynLib detectRUNS
+#' @importFrom Rcpp sourceCpp
+#'
+tableRunsCpp <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
+    .Call('_detectRUNS_tableRunsCpp', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
+}
+
diff --git a/detectRUNS/man/tableRunsCpp.Rd b/detectRUNS/man/tableRunsCpp.Rd
new file mode 100644
index 0000000..7672a3a
--- /dev/null
+++ b/detectRUNS/man/tableRunsCpp.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{tableRunsCpp}
+\alias{tableRunsCpp}
+\title{Function to retrieve most common runs in the population}
+\usage{
+tableRunsCpp(runs, genotypeFile, mapFile, threshold = 0.5)
+}
+\arguments{
+\item{runs}{R object (dataframe) with results on detected runs}
+
+\item{genotypeFile}{Plink ped file (for SNP position)}
+
+\item{mapFile}{Plink map file (for SNP position)}
+
+\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired
+proportion of individuals carrying that run (e.g. 70\%)}
+}
+\value{
+A dataframe with the most common runs detected in the sampled individuals
+(the group/population, start and end position of the run, chromosome, number of SNP
+included in the run and average percentage of SNPs in run
+are reported in the output dataframe)
+}
+\description{
+This function takes in input either the run results and returns a subset of
+the runs most commonly found in the group/population. The parameter
+\code{threshold} controls the definition
+of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
+}
+\examples{
+# getting map and ped paths
+genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
+mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
+
+# calculating runs of Homozygosity
+\dontrun{
+# skipping runs calculation
+runs <- slidingRUNS.run(genotypeFile, mapFile,
+  windowSize = 15, threshold = 0.1, minSNP = 15,
+  ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
+)
+}
+# loading pre-calculated data
+runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
+
+table <- tableRunsCpp(
+  runs = runsDF, genotypeFile = genotypeFile,
+  mapFile = mapFile, threshold = 0.5)
+
+}
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index e4721e6..862949d 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -159,6 +159,20 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// tableRunsCpp
+DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold);
+RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< DataFrame >::type runs(runsSEXP);
+    Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP);
+    Rcpp::traits::input_parameter< std::string >::type mapFile(mapFileSEXP);
+    Rcpp::traits::input_parameter< const float >::type threshold(thresholdSEXP);
+    rcpp_result_gen = Rcpp::wrap(tableRunsCpp(runs, genotypeFile, mapFile, threshold));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_detectRUNS_fast_factor", (DL_FUNC) &_detectRUNS_fast_factor, 1},
@@ -172,6 +186,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1},
     {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9},
     {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3},
+    {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4},
     {NULL, NULL, 0}
 };
 
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 89b04ab..7279cde 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1013,3 +1013,169 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
   // returning all runs for this individual genotype
   return(res);
 }
+
+
+DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
+  CharacterVector population = runs[0];
+  CharacterVector ind = runs[1];
+  CharacterVector chromosomes = runs[2];
+  IntegerVector count = runs[3];
+  IntegerVector start = runs[4];
+  IntegerVector end = runs[5];
+  IntegerVector length = runs[6];
+
+  LogicalVector chrom_idx(chromosomes.size());
+
+  for (unsigned int j=0; j<chromosomes.size(); j++) {
+    chrom_idx[j] = (chromosomes[j] == chrom);
+  }
+
+  return DataFrame::create(
+    Named("POPULATION") = population[chrom_idx],
+    Named("IND") = ind[chrom_idx],
+    Named("CHROMOSOME") = chromosomes[chrom_idx],
+    Named("COUNT") = count[chrom_idx],
+    Named("START") = start[chrom_idx],
+    Named("END") = end[chrom_idx],
+    Named("LENGTH") = length[chrom_idx]
+  );
+}
+
+
+DataFrame subset_map_by_chrom(DataFrame mappa, std::string chrom){
+  CharacterVector chromosomes = mappa[0];
+  CharacterVector snps = mappa[1];
+  IntegerVector positions = mappa[2];
+
+  LogicalVector chrom_idx(chromosomes.size());
+
+  for (unsigned int j=0; j<chromosomes.size(); j++) {
+    chrom_idx[j] = (chromosomes[j] == chrom);
+  }
+
+  return DataFrame::create(
+    Named("CHR") = chromosomes[chrom_idx],
+    Named("SNP_NAME") = snps[chrom_idx],
+    Named("POSITION") = positions[chrom_idx]
+  );
+}
+
+
+DataFrame filter_snpInsideRuns_by_threshold(
+    DataFrame snpInsideRuns, float threshold){
+  CharacterVector snps = snpInsideRuns["SNP_NAME"];
+  CharacterVector chromosomes = snpInsideRuns["CHR"];
+  IntegerVector positions = snpInsideRuns["POSITION"];
+  IntegerVector counts = snpInsideRuns["COUNT"];
+  CharacterVector breeds = snpInsideRuns["BREED"];
+  NumericVector percentages = snpInsideRuns["PERCENTAGE"];
+  IntegerVector numbers = snpInsideRuns["Number"];
+
+  LogicalVector idx(percentages.size());
+
+  for (unsigned int j=0; j<percentages.size(); j++) {
+    idx[j] = (percentages[j] >= threshold);
+  }
+
+  return DataFrame::create(
+    Named("SNP_NAME") = snps[idx],
+    Named("CHR") = chromosomes[idx],
+    Named("POSITION") = positions[idx],
+    Named("COUNT") = counts[idx],
+    Named("BREED") = fast_factor(breeds[idx]), //returns a factor
+    Named("PERCENTAGE") = percentages[idx],
+    Named("Number") = numbers[idx],
+    _["stringsAsFactors"] = false);
+}
+
+
+//' Function to retrieve most common runs in the population
+//'
+//' This function takes in input either the run results and returns a subset of
+//' the runs most commonly found in the group/population. The parameter
+//' \code{threshold} controls the definition
+//' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
+//'
+//' @param genotypeFile Plink ped file (for SNP position)
+//' @param mapFile Plink map file (for SNP position)
+//' @param runs R object (dataframe) with results on detected runs
+//' @param threshold value from 0 to 1 (default 0.7) that controls the desired
+//' proportion of individuals carrying that run (e.g. 70\%)
+//'
+//' @return A dataframe with the most common runs detected in the sampled individuals
+//' (the group/population, start and end position of the run, chromosome, number of SNP
+//' included in the run and average percentage of SNPs in run
+//' are reported in the output dataframe)
+//' @export
+//'
+//' @examples
+//' # getting map and ped paths
+//' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
+//' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
+//'
+//' # calculating runs of Homozygosity
+//' \dontrun{
+//' # skipping runs calculation
+//' runs <- slidingRUNS.run(genotypeFile, mapFile,
+//'   windowSize = 15, threshold = 0.1, minSNP = 15,
+//'   ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
+//' )
+//' }
+//' # loading pre-calculated data
+//' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+//' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
+//'
+//' table <- tableRunsCpp(
+//'   runs = runsDF, genotypeFile = genotypeFile,
+//'   mapFile = mapFile, threshold = 0.5)
+//'
+//' @useDynLib detectRUNS
+//' @importFrom Rcpp sourceCpp
+//'
+// [[Rcpp::export]]
+DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile,
+                  const float threshold = 0.5) {
+
+  // set a threshold
+  float threshold_used = threshold * 100;
+  Rprintf("Threshold used: %f", threshold_used);
+
+  // Obtaining namespace of detectRUNS package
+  Environment pkg = Environment::namespace_env("detectRUNS");
+
+  // Picking up readMapFile function from detectRUNS package
+  Function readMapFile = pkg["readMapFile"];
+
+  // read mapfile using R function (which uses data.table::fread)
+  DataFrame mappa = readMapFile(mapFile);
+
+  // get distinct chromosomes
+  CharacterVector chromosomes = runs[2];
+  CharacterVector unique_chromosomes = sort_unique(chromosomes);
+
+  DataFrame snpInsideRuns;
+
+  for (unsigned int i=0; i<unique_chromosomes.size(); i++) {
+    std::string chrom = as<std::string>(unique_chromosomes[i]);
+
+    // extract the desidered chrom
+    DataFrame runsChrom = subset_runs_by_chrom(runs, chrom);
+    DataFrame mapKrom = subset_map_by_chrom(mappa, chrom);
+
+    // calculate snpInsideRuns
+    snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
+
+    // add a column with the row names as integer
+    IntegerVector Number = seq(1, snpInsideRuns.nrows());
+    snpInsideRuns.push_back(Number, "Number");
+
+    // now filter snpInsideRuns relying on threshold
+    snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
+
+    // debug
+    break;
+  }
+
+  return snpInsideRuns;
+
+}

From 7db05eb32358f2e820151f69c2b4f97add7b7004 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 4 Apr 2022 16:45:58 +0200
Subject: [PATCH 09/23] :sparkles: implement tableRuns in cpp

---
 detectRUNS/R/Stats.R         |   4 +-
 detectRUNS/src/functions.cpp | 199 ++++++++++++++++++++++++++++++++---
 2 files changed, 185 insertions(+), 18 deletions(-)

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 8ae2235..cdbf55e 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -480,7 +480,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
       group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE >= threshold_used, ]
 
       # after filtering, I need to have at least 2 rows or I can't do the following stuff
-      if (nrow(group_subset) <= 2) {
+      if (nrow(group_subset) < 2) {
         next
       }
 
@@ -527,10 +527,10 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
             ))
 
             # message("Writing: ", paste0(tail(final_table, n = 1), sep = " "))
-            # message("Start a new segment: ", paste0(group_subset[x, ], sep = " "))
           }
 
           # reset variable
+          # message("Start a new segment: ", paste0(group_subset[x, ], sep = " "))
           snp_count <- 1
           sum_pct <- group_subset[x, "PERCENTAGE"]
           from <- group_subset[x, 3]
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 7279cde..fad9804 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1026,11 +1026,13 @@ DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
 
   LogicalVector chrom_idx(chromosomes.size());
 
+  Rprintf("Got %d runs\n", runs.nrows());
+
   for (unsigned int j=0; j<chromosomes.size(); j++) {
     chrom_idx[j] = (chromosomes[j] == chrom);
   }
 
-  return DataFrame::create(
+  DataFrame res = DataFrame::create(
     Named("POPULATION") = population[chrom_idx],
     Named("IND") = ind[chrom_idx],
     Named("CHROMOSOME") = chromosomes[chrom_idx],
@@ -1039,25 +1041,33 @@ DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
     Named("END") = end[chrom_idx],
     Named("LENGTH") = length[chrom_idx]
   );
+
+  Rprintf("%d runs remained after filtering for chrom %s\n", res.nrows(), chrom.c_str());
+  return res;
 }
 
 
 DataFrame subset_map_by_chrom(DataFrame mappa, std::string chrom){
-  CharacterVector chromosomes = mappa[0];
-  CharacterVector snps = mappa[1];
-  IntegerVector positions = mappa[2];
+  CharacterVector chromosomes = mappa["CHR"];
+  CharacterVector snps = mappa["SNP_NAME"];
+  IntegerVector positions = mappa["POSITION"];
 
   LogicalVector chrom_idx(chromosomes.size());
 
+  Rprintf("Got %d snps\n", mappa.nrows());
+
   for (unsigned int j=0; j<chromosomes.size(); j++) {
-    chrom_idx[j] = (chromosomes[j] == chrom);
+    chrom_idx[j] = (as<std::string>(chromosomes[j]) == chrom);
   }
 
-  return DataFrame::create(
+  DataFrame res = DataFrame::create(
     Named("CHR") = chromosomes[chrom_idx],
     Named("SNP_NAME") = snps[chrom_idx],
     Named("POSITION") = positions[chrom_idx]
   );
+
+  Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str());
+  return res;
 }
 
 
@@ -1073,11 +1083,13 @@ DataFrame filter_snpInsideRuns_by_threshold(
 
   LogicalVector idx(percentages.size());
 
+  Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows());
+
   for (unsigned int j=0; j<percentages.size(); j++) {
     idx[j] = (percentages[j] >= threshold);
   }
 
-  return DataFrame::create(
+  DataFrame res = DataFrame::create(
     Named("SNP_NAME") = snps[idx],
     Named("CHR") = chromosomes[idx],
     Named("POSITION") = positions[idx],
@@ -1086,9 +1098,39 @@ DataFrame filter_snpInsideRuns_by_threshold(
     Named("PERCENTAGE") = percentages[idx],
     Named("Number") = numbers[idx],
     _["stringsAsFactors"] = false);
+
+  Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold);
+  return res;
 }
 
 
+DataFrame filter_snpInsideRuns_by_breed(
+    DataFrame snpInsideRuns, std::string breed){
+  CharacterVector snps = snpInsideRuns["SNP_NAME"];
+  CharacterVector chromosomes = snpInsideRuns["CHR"];
+  IntegerVector positions = snpInsideRuns["POSITION"];
+  IntegerVector counts = snpInsideRuns["COUNT"];
+  CharacterVector breeds = snpInsideRuns["BREED"];
+  NumericVector percentages = snpInsideRuns["PERCENTAGE"];
+  IntegerVector numbers = snpInsideRuns["Number"];
+
+  LogicalVector idx(breeds.size());
+
+  for (unsigned int j=0; j<breeds.size(); j++) {
+    idx[j] = (breeds[j] == breed);
+  }
+
+  return DataFrame::create(
+    Named("SNP_NAME") = snps[idx],
+    Named("CHR") = chromosomes[idx],
+    Named("POSITION") = positions[idx],
+    Named("COUNT") = counts[idx],
+    Named("BREED") = fast_factor(breeds[idx]), //returns a factor
+    Named("PERCENTAGE") = percentages[idx],
+    Named("Number") = numbers[idx],
+    _["stringsAsFactors"] = false);
+}
+
 //' Function to retrieve most common runs in the population
 //'
 //' This function takes in input either the run results and returns a subset of
@@ -1133,12 +1175,13 @@ DataFrame filter_snpInsideRuns_by_threshold(
 //' @importFrom Rcpp sourceCpp
 //'
 // [[Rcpp::export]]
-DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile,
-                  const float threshold = 0.5) {
+DataFrame tableRunsCpp(
+    DataFrame runs, std::string genotypeFile, std::string mapFile,
+    const float threshold = 0.5) {
 
   // set a threshold
   float threshold_used = threshold * 100;
-  Rprintf("Threshold used: %f", threshold_used);
+  Rprintf("Threshold used: %f\n", threshold_used);
 
   // Obtaining namespace of detectRUNS package
   Environment pkg = Environment::namespace_env("detectRUNS");
@@ -1153,17 +1196,29 @@ DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string map
   CharacterVector chromosomes = runs[2];
   CharacterVector unique_chromosomes = sort_unique(chromosomes);
 
-  DataFrame snpInsideRuns;
+  // These will be the output vectors
+  CharacterVector res_group;
+  CharacterVector res_start_snp;
+  CharacterVector res_end_snp;
+  CharacterVector res_chrom;
+  IntegerVector res_n_snp;
+  IntegerVector res_from;
+  IntegerVector res_to;
+  NumericVector res_avg_pct;
+
+  Rcout << "Calculation % SNP in ROH\n";
 
   for (unsigned int i=0; i<unique_chromosomes.size(); i++) {
     std::string chrom = as<std::string>(unique_chromosomes[i]);
 
-    // extract the desidered chrom
+    Rprintf("Processing %d/%d chromosome\n", i+1, unique_chromosomes.size());
+
+    // extract the desired chrom
     DataFrame runsChrom = subset_runs_by_chrom(runs, chrom);
     DataFrame mapKrom = subset_map_by_chrom(mappa, chrom);
 
     // calculate snpInsideRuns
-    snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
+    DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
 
     // add a column with the row names as integer
     IntegerVector Number = seq(1, snpInsideRuns.nrows());
@@ -1172,10 +1227,122 @@ DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string map
     // now filter snpInsideRuns relying on threshold
     snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
 
-    // debug
-    break;
+    // collect the distinct breeds
+    CharacterVector breeds = snpInsideRuns["BREED"];
+    CharacterVector group_list = unique(breeds);
+
+    for (unsigned int j=0; j<group_list.size(); j++) {
+      std::string grp = as<std::string>(group_list[j]);
+
+      Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str());
+
+      DataFrame group_subset = filter_snpInsideRuns_by_breed(snpInsideRuns, grp);
+
+      // after filtering, I need to have at least 2 rows or I can't do the following stuff
+      if (group_subset.nrows() < 2) {
+        continue;
+      }
+
+      // initialize stuff
+      IntegerVector idxs = group_subset["Number"];
+      IntegerVector positions = group_subset["POSITION"];
+      CharacterVector snp_names = group_subset["SNP_NAME"];
+      NumericVector sum_pcts = group_subset["PERCENTAGE"];
+
+      int old_idx = idxs[0];
+      int from = positions[0];
+      String Start_SNP = snp_names[0];
+      int snp_count = 1;
+      double sum_pct = sum_pcts[0];
+
+      Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from);
+      Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count);
+      Rprintf("sum_pct: %f\n", sum_pct);
+
+      for (unsigned int x=1; x<group_subset.nrows(); x++) {
+        Rprintf("Processing: %s, ", as<std::string>(snp_names[x]).c_str());
+        Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
+
+        // get current index
+        int new_idx = idxs[x];
+
+        // Difference between indexes. This will be 1 in case of adjacent rows
+        int diff = new_idx - old_idx;
+
+        // declare other variables
+        String end_SNP;
+        int TO;
+
+        if ((diff > 1) | (x == group_subset.nrows()-1)) {
+          if (x == group_subset.nrows()-1) {
+            Rcout << "End of subset\n";
+            end_SNP = snp_names[x];
+            TO = positions[x];
+          } else {
+            Rcout << "End of segment\n";
+            end_SNP = snp_names[x-1];
+            TO = positions[x-1];
+          }
+
+          // throw away row if composed by only one SNP
+          if (snp_count > 1) {
+            res_group.push_back(grp);
+            res_start_snp.push_back(Start_SNP);
+            res_end_snp.push_back(end_SNP);
+            res_chrom.push_back(chrom);
+            res_n_snp.push_back(snp_count);
+            res_from.push_back(from);
+            res_to.push_back(TO);
+            res_avg_pct.push_back(sum_pct);
+
+            Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring());
+            Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str());
+            Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO);
+            Rprintf("sum_pct: %d\n", sum_pct);
+          }
+
+          // reset variable
+          Rprintf("Start a new segment: %s, ", as<std::string>(snp_names[x]).c_str());
+          Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
+
+          snp_count = 1;
+          sum_pct = sum_pcts[x];
+          from = positions[x];
+          Start_SNP = snp_names[x];
+        } else {
+          // update stuff
+          snp_count++;
+          sum_pct += sum_pcts[x];
+
+          Rprintf("Add %s to group ", as<std::string>(snp_names[x]).c_str());
+          Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct);
+        }
+
+        // update index
+        old_idx = new_idx;
+      } // cycle for snpInsideRuns
+
+    } // cycle for group (breed)
+
+  } // cycle for chromosome
+
+  // calculate the avg percentage
+  for (unsigned int i=0; i<res_avg_pct.size(); i++) {
+    res_avg_pct[i] = res_avg_pct[i] / res_n_snp[i];
   }
 
-  return snpInsideRuns;
+  Rcout << "Calculation % SNP in ROH finish\n";
+
+  return DataFrame::create(
+    Named("Group") = res_group,
+    Named("Start_SNP") = res_start_snp,
+    Named("End_SNP") = res_end_snp,
+    Named("chrom") = res_chrom,
+    Named("nSNP") = res_n_snp,
+    Named("from") = res_from,
+    Named("to") = res_to,
+    Named("avg_pct") = res_avg_pct,
+    _["stringsAsFactors"] = false
+  );
 
 }

From 2ca0e5a3c86edf490ccecb2f5bb49b319f762c25 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 4 Apr 2022 16:55:38 +0200
Subject: [PATCH 10/23] :white_check_mark: test tableRunsCpp

---
 detectRUNS/src/functions.cpp           | 46 +++++++++++++-------------
 detectRUNS/tests/testthat/test_Stats.R | 11 ++++++
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index fad9804..87fd31d 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1026,7 +1026,7 @@ DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
 
   LogicalVector chrom_idx(chromosomes.size());
 
-  Rprintf("Got %d runs\n", runs.nrows());
+  // Rprintf("Got %d runs\n", runs.nrows());
 
   for (unsigned int j=0; j<chromosomes.size(); j++) {
     chrom_idx[j] = (chromosomes[j] == chrom);
@@ -1042,7 +1042,7 @@ DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
     Named("LENGTH") = length[chrom_idx]
   );
 
-  Rprintf("%d runs remained after filtering for chrom %s\n", res.nrows(), chrom.c_str());
+  // Rprintf("%d runs remained after filtering for chrom %s\n", res.nrows(), chrom.c_str());
   return res;
 }
 
@@ -1054,7 +1054,7 @@ DataFrame subset_map_by_chrom(DataFrame mappa, std::string chrom){
 
   LogicalVector chrom_idx(chromosomes.size());
 
-  Rprintf("Got %d snps\n", mappa.nrows());
+  // Rprintf("Got %d snps\n", mappa.nrows());
 
   for (unsigned int j=0; j<chromosomes.size(); j++) {
     chrom_idx[j] = (as<std::string>(chromosomes[j]) == chrom);
@@ -1066,7 +1066,7 @@ DataFrame subset_map_by_chrom(DataFrame mappa, std::string chrom){
     Named("POSITION") = positions[chrom_idx]
   );
 
-  Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str());
+  // Rprintf("%d snps remained after filtering for chrom %s\n", res.nrows(), chrom.c_str());
   return res;
 }
 
@@ -1083,7 +1083,7 @@ DataFrame filter_snpInsideRuns_by_threshold(
 
   LogicalVector idx(percentages.size());
 
-  Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows());
+  // Rprintf("Got %d snpInsideRuns\n", snpInsideRuns.nrows());
 
   for (unsigned int j=0; j<percentages.size(); j++) {
     idx[j] = (percentages[j] >= threshold);
@@ -1099,7 +1099,7 @@ DataFrame filter_snpInsideRuns_by_threshold(
     Named("Number") = numbers[idx],
     _["stringsAsFactors"] = false);
 
-  Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold);
+  // Rprintf("%d snpInsideRuns remained after filtering for threshold %f\n", res.nrows(), threshold);
   return res;
 }
 
@@ -1234,7 +1234,7 @@ DataFrame tableRunsCpp(
     for (unsigned int j=0; j<group_list.size(); j++) {
       std::string grp = as<std::string>(group_list[j]);
 
-      Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str());
+      // Rprintf("Processing breed: %s, chromosome: %s", grp.c_str(), chrom.c_str());
 
       DataFrame group_subset = filter_snpInsideRuns_by_breed(snpInsideRuns, grp);
 
@@ -1255,13 +1255,13 @@ DataFrame tableRunsCpp(
       int snp_count = 1;
       double sum_pct = sum_pcts[0];
 
-      Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from);
-      Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count);
-      Rprintf("sum_pct: %f\n", sum_pct);
+      // Rprintf("Starting from old_idx: %d, from: %d, ", old_idx, from);
+      // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count);
+      // Rprintf("sum_pct: %f\n", sum_pct);
 
       for (unsigned int x=1; x<group_subset.nrows(); x++) {
-        Rprintf("Processing: %s, ", as<std::string>(snp_names[x]).c_str());
-        Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
+        // Rprintf("Processing: %s, ", as<std::string>(snp_names[x]).c_str());
+        // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
 
         // get current index
         int new_idx = idxs[x];
@@ -1275,11 +1275,11 @@ DataFrame tableRunsCpp(
 
         if ((diff > 1) | (x == group_subset.nrows()-1)) {
           if (x == group_subset.nrows()-1) {
-            Rcout << "End of subset\n";
+            // Rcout << "End of subset\n";
             end_SNP = snp_names[x];
             TO = positions[x];
           } else {
-            Rcout << "End of segment\n";
+            // Rcout << "End of segment\n";
             end_SNP = snp_names[x-1];
             TO = positions[x-1];
           }
@@ -1295,15 +1295,15 @@ DataFrame tableRunsCpp(
             res_to.push_back(TO);
             res_avg_pct.push_back(sum_pct);
 
-            Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring());
-            Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str());
-            Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO);
-            Rprintf("sum_pct: %d\n", sum_pct);
+            // Rprintf("Writing group: %s, Start_SNP: %s, ", grp.c_str(), Start_SNP.get_cstring());
+            // Rprintf("end_SNP: %s, chrom: %s, ", end_SNP.get_cstring(), chrom.c_str());
+            // Rprintf("snp_count: %d, from: %d, TO: %d, ", snp_count, from, TO);
+            // Rprintf("sum_pct: %d\n", sum_pct);
           }
 
           // reset variable
-          Rprintf("Start a new segment: %s, ", as<std::string>(snp_names[x]).c_str());
-          Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
+          // Rprintf("Start a new segment: %s, ", as<std::string>(snp_names[x]).c_str());
+          // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
 
           snp_count = 1;
           sum_pct = sum_pcts[x];
@@ -1314,8 +1314,8 @@ DataFrame tableRunsCpp(
           snp_count++;
           sum_pct += sum_pcts[x];
 
-          Rprintf("Add %s to group ", as<std::string>(snp_names[x]).c_str());
-          Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct);
+          // Rprintf("Add %s to group ", as<std::string>(snp_names[x]).c_str());
+          // Rprintf("(count: %d, sum_pct: %f)\n", snp_count, sum_pct);
         }
 
         // update index
@@ -1334,7 +1334,7 @@ DataFrame tableRunsCpp(
   Rcout << "Calculation % SNP in ROH finish\n";
 
   return DataFrame::create(
-    Named("Group") = res_group,
+    Named("Group") = fast_factor(res_group),
     Named("Start_SNP") = res_start_snp,
     Named("End_SNP") = res_end_snp,
     Named("chrom") = res_chrom,
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index eb47a79..8864381 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -19,3 +19,14 @@ test_that("Test tableRuns", {
 
   expect_equal(reference, test)
 })
+
+test_that("Test tableRunsCpp", {
+  colClasses <- c("factor", "character", "character", "character", "numeric",
+                  "integer", "integer", "numeric")
+  reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
+
+  runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
+  test <- tableRunsCpp(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
+
+  expect_equal(reference, test)
+})

From 3c7d23598568c6ed3ee578734f0b24d4bf5a1928 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 4 Apr 2022 17:39:28 +0200
Subject: [PATCH 11/23] :bug: skip tableRuns when no data available

---
 detectRUNS/src/functions.cpp | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 87fd31d..98b033a 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1211,15 +1211,34 @@ DataFrame tableRunsCpp(
   for (unsigned int i=0; i<unique_chromosomes.size(); i++) {
     std::string chrom = as<std::string>(unique_chromosomes[i]);
 
-    Rprintf("Processing %d/%d chromosome\n", i+1, unique_chromosomes.size());
+    Rprintf(
+      "Processing chromosome '%s'  (%d/%d)\n",
+      chrom.c_str(),
+      i+1, unique_chromosomes.size());
 
     // extract the desired chrom
     DataFrame runsChrom = subset_runs_by_chrom(runs, chrom);
+
+    if (runsChrom.nrows() == 0) {
+      REprintf("No runs on chromosome %s\n", chrom.c_str());
+      continue;
+    }
+
     DataFrame mapKrom = subset_map_by_chrom(mappa, chrom);
 
+    if (mapKrom.nrows() == 0) {
+      REprintf("No SNPs data for chromosome %s\n", chrom.c_str());
+      continue;
+    }
+
     // calculate snpInsideRuns
     DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
 
+    if (snpInsideRuns.nrows() == 0) {
+      REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str());
+      continue;
+    }
+
     // add a column with the row names as integer
     IntegerVector Number = seq(1, snpInsideRuns.nrows());
     snpInsideRuns.push_back(Number, "Number");
@@ -1227,6 +1246,13 @@ DataFrame tableRunsCpp(
     // now filter snpInsideRuns relying on threshold
     snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
 
+    if (snpInsideRuns.nrows() == 0) {
+      REprintf(
+        "No SNPs in runs after filtering %f for chromosome %s\n",
+        threshold_used,  chrom.c_str());
+      continue;
+    }
+
     // collect the distinct breeds
     CharacterVector breeds = snpInsideRuns["BREED"];
     CharacterVector group_list = unique(breeds);

From c78177a76564241bdbea31a513c27e1fba20ebfe Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Mon, 4 Apr 2022 17:47:12 +0200
Subject: [PATCH 12/23] :recycle: refactor snpInsideRunsCpp internal variables

no more copy values from dataframe
---
 detectRUNS/src/functions.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 98b033a..aa1a401 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -921,10 +921,10 @@ void Runs::dumpRuns() {
 DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
                            std::string genotypeFile) {
 
-  // transform R object in Cpp object
-  std::vector<int> POSITIONS = as<std::vector<int> >(mapChrom["POSITION"]);
-  std::vector<std::string> SNP_NAME = as<std::vector<std::string> >(mapChrom["SNP_NAME"]);
-  std::vector<std::string> CHR = as<std::vector<std::string> >(mapChrom["CHR"]);
+  // get columns as vectors
+  IntegerVector POSITIONS = mapChrom["POSITION"];
+  CharacterVector SNP_NAME = mapChrom["SNP_NAME"];
+  CharacterVector CHR = mapChrom["CHR"];
 
   // get unique breeds
   CharacterVector population = runsChrom["POPULATION"];
@@ -1212,7 +1212,7 @@ DataFrame tableRunsCpp(
     std::string chrom = as<std::string>(unique_chromosomes[i]);
 
     Rprintf(
-      "Processing chromosome '%s'  (%d/%d)\n",
+      "Processing chromosome '%s' (%d/%d)\n",
       chrom.c_str(),
       i+1, unique_chromosomes.size());
 

From 677fcad41898ace90371ee6fb8f4bfe9cd17030f Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 12:01:53 +0200
Subject: [PATCH 13/23] :recycle: filter tableRuns by threshold before
 evaluating breeds

---
 detectRUNS/R/Stats.R                   |  7 +++++--
 detectRUNS/tests/testthat/test_Stats.R | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index cdbf55e..0ae1597 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -475,9 +475,12 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
     # add a column with the row names as integer
     snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns))
 
+    # filter by threshold once
+    snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ]
+
     for (grp in group_list) {
-      # create subset for group/thresold
-      group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp) & snpInsideRuns$PERCENTAGE >= threshold_used, ]
+      # create subset for group
+      group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp), ]
 
       # after filtering, I need to have at least 2 rows or I can't do the following stuff
       if (nrow(group_subset) < 2) {
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index 8864381..fe6a2b6 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -3,29 +3,33 @@ library(detectRUNS)
 context("Testing Stats")
 
 # get file paths
-genotypeFile  <- "test.ped"
+genotypeFile <- "test.ped"
 mapFile <- "test.map"
 rawFile <- "test.raw"
 runsFile <- "test.ROHet.consecutive.csv"
 
 test_that("Test tableRuns", {
-  colClasses <- c("factor", "character", "character", "character", "numeric",
-                  "integer", "integer", "numeric")
+  colClasses <- c(
+    "factor", "character", "character", "character", "numeric",
+    "integer", "integer", "numeric"
+  )
   reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
   levels(reference$Group) <- c("Jacobs", "Navajo-Churro")
 
-  runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
+  runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
   test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 
   expect_equal(reference, test)
 })
 
 test_that("Test tableRunsCpp", {
-  colClasses <- c("factor", "character", "character", "character", "numeric",
-                  "integer", "integer", "numeric")
+  colClasses <- c(
+    "factor", "character", "character", "character", "numeric",
+    "integer", "integer", "numeric"
+  )
   reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
 
-  runsDF <- readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
+  runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
   test <- tableRunsCpp(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 
   expect_equal(reference, test)

From 84eb6e6c2a1191b1cebefaa8c9b7ded4e1e548f0 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 12:22:29 +0200
Subject: [PATCH 14/23] :monocle_face: evaluate tableRuns performance in Cpp
 and R

---
 performance/test2.R | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 performance/test2.R

diff --git a/performance/test2.R b/performance/test2.R
new file mode 100644
index 0000000..51cf936
--- /dev/null
+++ b/performance/test2.R
@@ -0,0 +1,26 @@
+
+# clean up
+rm(list = ls())
+
+# importing libraries
+library(detectRUNS)
+library(microbenchmark)
+
+genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
+mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
+runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
+
+# read runs
+runs <- readExternalRuns(runsFile, program = "detectRUNS")
+
+# how many times perform test
+times <- 100
+
+test_tableRuns <- microbenchmark(
+  R = tableRuns(runs, mapFile = mapFile, genotypeFile = genotypeFile),
+  Cpp = tableRunsCpp(runs, mapFile = mapFile, genotypeFile = genotypeFile),
+  unit = 'ms',
+  times = times
+)
+
+boxplot(test_tableRuns)

From d05ef4de01eba5996a0d78411e59159135dd5a56 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 14:56:18 +0200
Subject: [PATCH 15/23] :poop: duplicate snpInsideRunsCpp to ensure backward
 compatibility

---
 detectRUNS/R/RcppExports.R          |  20 +++++
 detectRUNS/R/Stats.R                |   5 +-
 detectRUNS/man/countSnpInRunsCpp.Rd |  22 +++++
 detectRUNS/src/RcppExports.cpp      |  14 +++
 detectRUNS/src/functions.cpp        | 129 ++++++++++++++++++++++++++--
 5 files changed, 180 insertions(+), 10 deletions(-)
 create mode 100644 detectRUNS/man/countSnpInRunsCpp.Rd

diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index 946d400..03466ec 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -196,6 +196,26 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
     .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
 }
 
+#' Function to count number of times a SNP is in a RUN
+#'
+#' Similar to snpInsideRunsCpp, it apply additional column to dataframe.
+#' Require to ensure backward compatibility.
+#'
+#' @param runsChrom R object (dataframe) with results per chromosome
+#' @param mapChrom R map object with SNP per chromosome
+#' @param genotypeFile genotype (.ped) file location
+#'
+#' @return dataframe with counts per SNP in runs (per population)
+#'
+#' @import utils
+#'
+#' @useDynLib detectRUNS
+#' @importFrom Rcpp sourceCpp
+#'
+countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
+    .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
+}
+
 #' Function to retrieve most common runs in the population
 #'
 #' This function takes in input either the run results and returns a subset of
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 0ae1597..9e740ce 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -470,10 +470,7 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
     mapKrom <- mappa[mappa$CHR == chrom, ]
 
     # calculate snpInsideRuns
-    snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile)
-
-    # add a column with the row names as integer
-    snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns))
+    snpInsideRuns <- countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile)
 
     # filter by threshold once
     snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ]
diff --git a/detectRUNS/man/countSnpInRunsCpp.Rd b/detectRUNS/man/countSnpInRunsCpp.Rd
new file mode 100644
index 0000000..56780ed
--- /dev/null
+++ b/detectRUNS/man/countSnpInRunsCpp.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{countSnpInRunsCpp}
+\alias{countSnpInRunsCpp}
+\title{Function to count number of times a SNP is in a RUN}
+\usage{
+countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile)
+}
+\arguments{
+\item{runsChrom}{R object (dataframe) with results per chromosome}
+
+\item{mapChrom}{R map object with SNP per chromosome}
+
+\item{genotypeFile}{genotype (.ped) file location}
+}
+\value{
+dataframe with counts per SNP in runs (per population)
+}
+\description{
+Similar to snpInsideRunsCpp, it apply additional column to dataframe.
+Require to ensure backward compatibility.
+}
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index 862949d..b4252f5 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -159,6 +159,19 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// countSnpInRunsCpp
+DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile);
+RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP);
+    Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP);
+    Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP);
+    rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile));
+    return rcpp_result_gen;
+END_RCPP
+}
 // tableRunsCpp
 DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold);
 RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) {
@@ -186,6 +199,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1},
     {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9},
     {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3},
+    {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 3},
     {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4},
     {NULL, NULL, 0}
 };
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index aa1a401..62960fe 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1015,6 +1015,127 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
 }
 
 
+//' Function to count number of times a SNP is in a RUN
+//'
+//' Similar to snpInsideRunsCpp, it apply additional column to dataframe.
+//' Require to ensure backward compatibility.
+//'
+//' @param runsChrom R object (dataframe) with results per chromosome
+//' @param mapChrom R map object with SNP per chromosome
+//' @param genotypeFile genotype (.ped) file location
+//'
+//' @return dataframe with counts per SNP in runs (per population)
+//'
+//' @import utils
+//'
+//' @useDynLib detectRUNS
+//' @importFrom Rcpp sourceCpp
+//'
+// [[Rcpp::export]]
+DataFrame countSnpInRunsCpp(
+    DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile) {
+
+  // get columns as vectors
+  IntegerVector POSITIONS = mapChrom["POSITION"];
+  CharacterVector SNP_NAME = mapChrom["SNP_NAME"];
+  CharacterVector CHR = mapChrom["CHR"];
+
+  // get unique breeds
+  CharacterVector population = runsChrom["POPULATION"];
+  std::vector<std::string> unique_breeds = as<std::vector<std::string> >(unique(population));
+
+  // sort unique breeds
+  std::sort(unique_breeds.begin(), unique_breeds.end());
+
+  // declare others variables
+  std::string ras;
+  int pos, index, map_size = SNP_NAME.size();
+  std::map <std::string, int> snpCounts, nBreeds ;
+
+  // define result size like n SNPs * unique_breeds
+  int result_size = map_size * unique_breeds.size();
+
+  // the columns of data.frame Defining data types accordingly slinding window
+  CharacterVector snp_name(result_size);
+  CharacterVector chrom(result_size);
+  IntegerVector position(result_size);
+  IntegerVector count(result_size);
+  CharacterVector breed(result_size);
+  NumericVector percentage(result_size);
+  IntegerVector number(result_size);
+
+  // get all populations
+  DataFrame pops = readPOPCpp(genotypeFile);
+  CharacterVector pop = pops["POP"];
+
+  // instantiate a Runs object
+  Runs runs(runsChrom);
+
+  // debug: dump object
+  // runs.dumpRuns();
+
+  // cicle among single breeds and find breed numbers
+  for (unsigned int i=0; i<unique_breeds.size(); i++) {
+    ras = unique_breeds[i];
+
+    // get total if individuals by breed
+    nBreeds[ras] = std::count(pop.begin(), pop.end(), ras.c_str());
+    // Rcout << "N. of animals of Population " << ras << ": " << nBreeds[ras] << std::endl;
+  }
+
+  // iterate over position. Update single values
+  for (unsigned int j=0; j<POSITIONS.size(); j++) {
+    // get a snp position
+    pos = POSITIONS[j];
+
+    // update counts
+    snpCounts = runs.countSnpbyBreed(pos, unique_breeds);
+
+    // update results by breed
+    for (unsigned int i=0; i<unique_breeds.size(); i++) {
+      // get a breed
+      ras = unique_breeds[i];
+
+      //calculating results index
+      index = i * map_size + j;
+
+      // update values
+      number[index] = j+1;
+      count[index] = snpCounts[ras];
+      breed[index] = ras;
+      percentage[index] = double(snpCounts[ras])/nBreeds[ras]*100;
+
+      // debug
+      // if (SNP_NAME[j] == "OAR24_6970428.1") {
+      //   Rcout << "Index i: " << i << " Index j: " << j << " Final index: " << index;
+      //   Rcout << " Breed: " << ras << " Count: " << snpCounts[ras] << std::endl;
+      // }
+
+      // read data from mapChrom
+      snp_name[index] = SNP_NAME[j];
+      chrom[index] = CHR[j];
+      position[index] = pos;
+
+    } // cicle for snp position
+
+  } // cicle for breed
+
+  // initialize dataframe of results.
+  DataFrame res = DataFrame::create(
+    Named("SNP_NAME") = snp_name,
+    Named("CHR") = chrom,
+    Named("POSITION") = position,
+    Named("COUNT") = count,
+    Named("BREED") = fast_factor(breed), //returns a factor
+    Named("PERCENTAGE") = percentage,
+    Named("Number") = number,
+    _["stringsAsFactors"] = false);
+
+  // returning all runs for this individual genotype
+  return(res);
+}
+
+
 DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
   CharacterVector population = runs[0];
   CharacterVector ind = runs[1];
@@ -1232,17 +1353,13 @@ DataFrame tableRunsCpp(
     }
 
     // calculate snpInsideRuns
-    DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
+    DataFrame snpInsideRuns = countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile);
 
     if (snpInsideRuns.nrows() == 0) {
       REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str());
       continue;
     }
 
-    // add a column with the row names as integer
-    IntegerVector Number = seq(1, snpInsideRuns.nrows());
-    snpInsideRuns.push_back(Number, "Number");
-
     // now filter snpInsideRuns relying on threshold
     snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
 
@@ -1285,7 +1402,7 @@ DataFrame tableRunsCpp(
       // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count);
       // Rprintf("sum_pct: %f\n", sum_pct);
 
-      for (unsigned int x=1; x<group_subset.nrows(); x++) {
+      for (int x=1; x<group_subset.nrows(); x++) {
         // Rprintf("Processing: %s, ", as<std::string>(snp_names[x]).c_str());
         // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
 

From ef971b6a410f44e928d736062d292bed17a99f86 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 15:27:09 +0200
Subject: [PATCH 16/23] :zap: no more pre-allocate memory in countSnpInRunsCpp

---
 detectRUNS/DESCRIPTION                 |  2 +-
 detectRUNS/R/RcppExports.R             |  5 +-
 detectRUNS/R/Stats.R                   |  8 ++-
 detectRUNS/man/countSnpInRunsCpp.Rd    |  4 +-
 detectRUNS/src/RcppExports.cpp         |  9 ++--
 detectRUNS/src/functions.cpp           | 69 +++++++++++---------------
 detectRUNS/tests/testthat/test_Stats.R |  1 -
 7 files changed, 44 insertions(+), 54 deletions(-)

diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
index e4c980d..1099bda 100644
--- a/detectRUNS/DESCRIPTION
+++ b/detectRUNS/DESCRIPTION
@@ -2,7 +2,7 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9002
+Version: 0.9.6.9003
 Date: 2022-04-04
 Authors@R: c(
     person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")),
diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index 03466ec..684a2f4 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -204,6 +204,7 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
 #' @param runsChrom R object (dataframe) with results per chromosome
 #' @param mapChrom R map object with SNP per chromosome
 #' @param genotypeFile genotype (.ped) file location
+#' @param threshold filter out SNPs below this value
 #'
 #' @return dataframe with counts per SNP in runs (per population)
 #'
@@ -212,8 +213,8 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
 #' @useDynLib detectRUNS
 #' @importFrom Rcpp sourceCpp
 #'
-countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
-    .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
+countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile, threshold = 50) {
+    .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile, threshold)
 }
 
 #' Function to retrieve most common runs in the population
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 9e740ce..626e604 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -469,11 +469,9 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
     runsChrom <- runs[runs$CHROMOSOME == chrom, ]
     mapKrom <- mappa[mappa$CHR == chrom, ]
 
-    # calculate snpInsideRuns
-    snpInsideRuns <- countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile)
-
-    # filter by threshold once
-    snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ]
+    # calculate snpInsideRuns with threshold
+    snpInsideRuns <- countSnpInRunsCpp(
+      runsChrom, mapKrom, genotypeFile, threshold_used)
 
     for (grp in group_list) {
       # create subset for group
diff --git a/detectRUNS/man/countSnpInRunsCpp.Rd b/detectRUNS/man/countSnpInRunsCpp.Rd
index 56780ed..f48c341 100644
--- a/detectRUNS/man/countSnpInRunsCpp.Rd
+++ b/detectRUNS/man/countSnpInRunsCpp.Rd
@@ -4,7 +4,7 @@
 \alias{countSnpInRunsCpp}
 \title{Function to count number of times a SNP is in a RUN}
 \usage{
-countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile)
+countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold = 50)
 }
 \arguments{
 \item{runsChrom}{R object (dataframe) with results per chromosome}
@@ -12,6 +12,8 @@ countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile)
 \item{mapChrom}{R map object with SNP per chromosome}
 
 \item{genotypeFile}{genotype (.ped) file location}
+
+\item{threshold}{filter out SNPs below this value}
 }
 \value{
 dataframe with counts per SNP in runs (per population)
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index b4252f5..14f87ea 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -160,15 +160,16 @@ BEGIN_RCPP
 END_RCPP
 }
 // countSnpInRunsCpp
-DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile);
-RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) {
+DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile, float threshold);
+RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP, SEXP thresholdSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP);
     Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP);
     Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP);
-    rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile));
+    Rcpp::traits::input_parameter< float >::type threshold(thresholdSEXP);
+    rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -199,7 +200,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1},
     {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9},
     {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3},
-    {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 3},
+    {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 4},
     {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4},
     {NULL, NULL, 0}
 };
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 62960fe..a9fffae 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1023,6 +1023,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
 //' @param runsChrom R object (dataframe) with results per chromosome
 //' @param mapChrom R map object with SNP per chromosome
 //' @param genotypeFile genotype (.ped) file location
+//' @param threshold filter out SNPs below this value
 //'
 //' @return dataframe with counts per SNP in runs (per population)
 //'
@@ -1033,7 +1034,8 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
 //'
 // [[Rcpp::export]]
 DataFrame countSnpInRunsCpp(
-    DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile) {
+    DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile,
+    float threshold = 50) {
 
   // get columns as vectors
   IntegerVector POSITIONS = mapChrom["POSITION"];
@@ -1049,20 +1051,17 @@ DataFrame countSnpInRunsCpp(
 
   // declare others variables
   std::string ras;
-  int pos, index, map_size = SNP_NAME.size();
+  int pos; float avg_perc;
   std::map <std::string, int> snpCounts, nBreeds ;
 
-  // define result size like n SNPs * unique_breeds
-  int result_size = map_size * unique_breeds.size();
-
   // the columns of data.frame Defining data types accordingly slinding window
-  CharacterVector snp_name(result_size);
-  CharacterVector chrom(result_size);
-  IntegerVector position(result_size);
-  IntegerVector count(result_size);
-  CharacterVector breed(result_size);
-  NumericVector percentage(result_size);
-  IntegerVector number(result_size);
+  CharacterVector snp_name;
+  CharacterVector chrom;
+  IntegerVector position;
+  IntegerVector count;
+  CharacterVector breed;
+  NumericVector percentage;
+  IntegerVector number;
 
   // get all populations
   DataFrame pops = readPOPCpp(genotypeFile);
@@ -1096,25 +1095,21 @@ DataFrame countSnpInRunsCpp(
       // get a breed
       ras = unique_breeds[i];
 
-      //calculating results index
-      index = i * map_size + j;
+      // calculate percentage
+      avg_perc = double(snpCounts[ras])/nBreeds[ras]*100;
 
-      // update values
-      number[index] = j+1;
-      count[index] = snpCounts[ras];
-      breed[index] = ras;
-      percentage[index] = double(snpCounts[ras])/nBreeds[ras]*100;
+      if (avg_perc >= threshold) {
+        // track SNP
+        number.push_back(j+1);
+        count.push_back(snpCounts[ras]);
+        breed.push_back(ras);
+        percentage.push_back(avg_perc);
 
-      // debug
-      // if (SNP_NAME[j] == "OAR24_6970428.1") {
-      //   Rcout << "Index i: " << i << " Index j: " << j << " Final index: " << index;
-      //   Rcout << " Breed: " << ras << " Count: " << snpCounts[ras] << std::endl;
-      // }
-
-      // read data from mapChrom
-      snp_name[index] = SNP_NAME[j];
-      chrom[index] = CHR[j];
-      position[index] = pos;
+        // read data from mapChrom
+        snp_name.push_back(SNP_NAME[j]);
+        chrom.push_back(CHR[j]);
+        position.push_back(pos);
+      }
 
     } // cicle for snp position
 
@@ -1353,20 +1348,14 @@ DataFrame tableRunsCpp(
     }
 
     // calculate snpInsideRuns
-    DataFrame snpInsideRuns = countSnpInRunsCpp(runsChrom, mapKrom, genotypeFile);
-
-    if (snpInsideRuns.nrows() == 0) {
-      REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str());
-      continue;
-    }
-
-    // now filter snpInsideRuns relying on threshold
-    snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
+    DataFrame snpInsideRuns = countSnpInRunsCpp(
+      runsChrom, mapKrom, genotypeFile, threshold_used);
 
     if (snpInsideRuns.nrows() == 0) {
       REprintf(
-        "No SNPs in runs after filtering %f for chromosome %s\n",
-        threshold_used,  chrom.c_str());
+        "No SNPs in runs for chromosome %s with threshold %f\n",
+        chrom.c_str(),
+        threshold);
       continue;
     }
 
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index fe6a2b6..0ec0fdc 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -14,7 +14,6 @@ test_that("Test tableRuns", {
     "integer", "integer", "numeric"
   )
   reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
-  levels(reference$Group) <- c("Jacobs", "Navajo-Churro")
 
   runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
   test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)

From 8d081e3a86a6ba72b2a414fddae5110b7350d18a Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 15:44:36 +0200
Subject: [PATCH 17/23] :rewind: revert countSnpInRunsCpp related code

---
 detectRUNS/DESCRIPTION                 |   2 +-
 detectRUNS/R/RcppExports.R             |  21 ----
 detectRUNS/R/Stats.R                   |  11 +-
 detectRUNS/man/countSnpInRunsCpp.Rd    |  24 -----
 detectRUNS/src/RcppExports.cpp         |  15 ---
 detectRUNS/src/functions.cpp           | 138 +++----------------------
 detectRUNS/tests/testthat/test_Stats.R |   1 +
 7 files changed, 26 insertions(+), 186 deletions(-)
 delete mode 100644 detectRUNS/man/countSnpInRunsCpp.Rd

diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
index 1099bda..e4c980d 100644
--- a/detectRUNS/DESCRIPTION
+++ b/detectRUNS/DESCRIPTION
@@ -2,7 +2,7 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9003
+Version: 0.9.6.9002
 Date: 2022-04-04
 Authors@R: c(
     person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")),
diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index 684a2f4..946d400 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -196,27 +196,6 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
     .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
 }
 
-#' Function to count number of times a SNP is in a RUN
-#'
-#' Similar to snpInsideRunsCpp, it apply additional column to dataframe.
-#' Require to ensure backward compatibility.
-#'
-#' @param runsChrom R object (dataframe) with results per chromosome
-#' @param mapChrom R map object with SNP per chromosome
-#' @param genotypeFile genotype (.ped) file location
-#' @param threshold filter out SNPs below this value
-#'
-#' @return dataframe with counts per SNP in runs (per population)
-#'
-#' @import utils
-#'
-#' @useDynLib detectRUNS
-#' @importFrom Rcpp sourceCpp
-#'
-countSnpInRunsCpp <- function(runsChrom, mapChrom, genotypeFile, threshold = 50) {
-    .Call('_detectRUNS_countSnpInRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile, threshold)
-}
-
 #' Function to retrieve most common runs in the population
 #'
 #' This function takes in input either the run results and returns a subset of
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 626e604..0ae1597 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -469,9 +469,14 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
     runsChrom <- runs[runs$CHROMOSOME == chrom, ]
     mapKrom <- mappa[mappa$CHR == chrom, ]
 
-    # calculate snpInsideRuns with threshold
-    snpInsideRuns <- countSnpInRunsCpp(
-      runsChrom, mapKrom, genotypeFile, threshold_used)
+    # calculate snpInsideRuns
+    snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile)
+
+    # add a column with the row names as integer
+    snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns))
+
+    # filter by threshold once
+    snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ]
 
     for (grp in group_list) {
       # create subset for group
diff --git a/detectRUNS/man/countSnpInRunsCpp.Rd b/detectRUNS/man/countSnpInRunsCpp.Rd
deleted file mode 100644
index f48c341..0000000
--- a/detectRUNS/man/countSnpInRunsCpp.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/RcppExports.R
-\name{countSnpInRunsCpp}
-\alias{countSnpInRunsCpp}
-\title{Function to count number of times a SNP is in a RUN}
-\usage{
-countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold = 50)
-}
-\arguments{
-\item{runsChrom}{R object (dataframe) with results per chromosome}
-
-\item{mapChrom}{R map object with SNP per chromosome}
-
-\item{genotypeFile}{genotype (.ped) file location}
-
-\item{threshold}{filter out SNPs below this value}
-}
-\value{
-dataframe with counts per SNP in runs (per population)
-}
-\description{
-Similar to snpInsideRunsCpp, it apply additional column to dataframe.
-Require to ensure backward compatibility.
-}
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index 14f87ea..862949d 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -159,20 +159,6 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
-// countSnpInRunsCpp
-DataFrame countSnpInRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile, float threshold);
-RcppExport SEXP _detectRUNS_countSnpInRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP, SEXP thresholdSEXP) {
-BEGIN_RCPP
-    Rcpp::RObject rcpp_result_gen;
-    Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP);
-    Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP);
-    Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP);
-    Rcpp::traits::input_parameter< float >::type threshold(thresholdSEXP);
-    rcpp_result_gen = Rcpp::wrap(countSnpInRunsCpp(runsChrom, mapChrom, genotypeFile, threshold));
-    return rcpp_result_gen;
-END_RCPP
-}
 // tableRunsCpp
 DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold);
 RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) {
@@ -200,7 +186,6 @@ static const R_CallMethodDef CallEntries[] = {
     {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1},
     {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9},
     {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3},
-    {"_detectRUNS_countSnpInRunsCpp", (DL_FUNC) &_detectRUNS_countSnpInRunsCpp, 4},
     {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4},
     {NULL, NULL, 0}
 };
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index a9fffae..aa1a401 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1015,122 +1015,6 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
 }
 
 
-//' Function to count number of times a SNP is in a RUN
-//'
-//' Similar to snpInsideRunsCpp, it apply additional column to dataframe.
-//' Require to ensure backward compatibility.
-//'
-//' @param runsChrom R object (dataframe) with results per chromosome
-//' @param mapChrom R map object with SNP per chromosome
-//' @param genotypeFile genotype (.ped) file location
-//' @param threshold filter out SNPs below this value
-//'
-//' @return dataframe with counts per SNP in runs (per population)
-//'
-//' @import utils
-//'
-//' @useDynLib detectRUNS
-//' @importFrom Rcpp sourceCpp
-//'
-// [[Rcpp::export]]
-DataFrame countSnpInRunsCpp(
-    DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile,
-    float threshold = 50) {
-
-  // get columns as vectors
-  IntegerVector POSITIONS = mapChrom["POSITION"];
-  CharacterVector SNP_NAME = mapChrom["SNP_NAME"];
-  CharacterVector CHR = mapChrom["CHR"];
-
-  // get unique breeds
-  CharacterVector population = runsChrom["POPULATION"];
-  std::vector<std::string> unique_breeds = as<std::vector<std::string> >(unique(population));
-
-  // sort unique breeds
-  std::sort(unique_breeds.begin(), unique_breeds.end());
-
-  // declare others variables
-  std::string ras;
-  int pos; float avg_perc;
-  std::map <std::string, int> snpCounts, nBreeds ;
-
-  // the columns of data.frame Defining data types accordingly slinding window
-  CharacterVector snp_name;
-  CharacterVector chrom;
-  IntegerVector position;
-  IntegerVector count;
-  CharacterVector breed;
-  NumericVector percentage;
-  IntegerVector number;
-
-  // get all populations
-  DataFrame pops = readPOPCpp(genotypeFile);
-  CharacterVector pop = pops["POP"];
-
-  // instantiate a Runs object
-  Runs runs(runsChrom);
-
-  // debug: dump object
-  // runs.dumpRuns();
-
-  // cicle among single breeds and find breed numbers
-  for (unsigned int i=0; i<unique_breeds.size(); i++) {
-    ras = unique_breeds[i];
-
-    // get total if individuals by breed
-    nBreeds[ras] = std::count(pop.begin(), pop.end(), ras.c_str());
-    // Rcout << "N. of animals of Population " << ras << ": " << nBreeds[ras] << std::endl;
-  }
-
-  // iterate over position. Update single values
-  for (unsigned int j=0; j<POSITIONS.size(); j++) {
-    // get a snp position
-    pos = POSITIONS[j];
-
-    // update counts
-    snpCounts = runs.countSnpbyBreed(pos, unique_breeds);
-
-    // update results by breed
-    for (unsigned int i=0; i<unique_breeds.size(); i++) {
-      // get a breed
-      ras = unique_breeds[i];
-
-      // calculate percentage
-      avg_perc = double(snpCounts[ras])/nBreeds[ras]*100;
-
-      if (avg_perc >= threshold) {
-        // track SNP
-        number.push_back(j+1);
-        count.push_back(snpCounts[ras]);
-        breed.push_back(ras);
-        percentage.push_back(avg_perc);
-
-        // read data from mapChrom
-        snp_name.push_back(SNP_NAME[j]);
-        chrom.push_back(CHR[j]);
-        position.push_back(pos);
-      }
-
-    } // cicle for snp position
-
-  } // cicle for breed
-
-  // initialize dataframe of results.
-  DataFrame res = DataFrame::create(
-    Named("SNP_NAME") = snp_name,
-    Named("CHR") = chrom,
-    Named("POSITION") = position,
-    Named("COUNT") = count,
-    Named("BREED") = fast_factor(breed), //returns a factor
-    Named("PERCENTAGE") = percentage,
-    Named("Number") = number,
-    _["stringsAsFactors"] = false);
-
-  // returning all runs for this individual genotype
-  return(res);
-}
-
-
 DataFrame subset_runs_by_chrom(DataFrame runs, std::string chrom){
   CharacterVector population = runs[0];
   CharacterVector ind = runs[1];
@@ -1348,14 +1232,24 @@ DataFrame tableRunsCpp(
     }
 
     // calculate snpInsideRuns
-    DataFrame snpInsideRuns = countSnpInRunsCpp(
-      runsChrom, mapKrom, genotypeFile, threshold_used);
+    DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
+
+    if (snpInsideRuns.nrows() == 0) {
+      REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str());
+      continue;
+    }
+
+    // add a column with the row names as integer
+    IntegerVector Number = seq(1, snpInsideRuns.nrows());
+    snpInsideRuns.push_back(Number, "Number");
+
+    // now filter snpInsideRuns relying on threshold
+    snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
 
     if (snpInsideRuns.nrows() == 0) {
       REprintf(
-        "No SNPs in runs for chromosome %s with threshold %f\n",
-        chrom.c_str(),
-        threshold);
+        "No SNPs in runs after filtering %f for chromosome %s\n",
+        threshold_used,  chrom.c_str());
       continue;
     }
 
@@ -1391,7 +1285,7 @@ DataFrame tableRunsCpp(
       // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count);
       // Rprintf("sum_pct: %f\n", sum_pct);
 
-      for (int x=1; x<group_subset.nrows(); x++) {
+      for (unsigned int x=1; x<group_subset.nrows(); x++) {
         // Rprintf("Processing: %s, ", as<std::string>(snp_names[x]).c_str());
         // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
 
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index 0ec0fdc..fe6a2b6 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -14,6 +14,7 @@ test_that("Test tableRuns", {
     "integer", "integer", "numeric"
   )
   reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
+  levels(reference$Group) <- c("Jacobs", "Navajo-Churro")
 
   runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
   test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)

From 43614e6a6b4fbe9901d80d41d1bed97b6a8cb895 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 16:14:08 +0200
Subject: [PATCH 18/23] :recycle: add Number column to snpInsideRunsCpp result

---
 detectRUNS/DESCRIPTION                     |  4 +--
 detectRUNS/R/Stats.R                       |  7 ++---
 detectRUNS/R/plots.R                       |  8 ++++++
 detectRUNS/src/functions.cpp               | 19 ++++++++------
 detectRUNS/tests/testthat/test_functions.R | 30 +++++++++++++---------
 5 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
index e4c980d..efcfced 100644
--- a/detectRUNS/DESCRIPTION
+++ b/detectRUNS/DESCRIPTION
@@ -2,8 +2,8 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9002
-Date: 2022-04-04
+Version: 0.9.6.9003
+Date: 2022-04-05
 Authors@R: c(
     person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")),
     person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 0ae1597..d14da1c 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -379,6 +379,10 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
       runsChrom <- runs[runs$CHROMOSOME==chrom,]
       mapKrom <- mappa[mappa$CHR==chrom,]
       snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
+
+      # remove Number column
+      snpInRuns$Number <- NULL
+
       all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
       n=n+1
       setTxtProgressBar(pb, n)
@@ -472,9 +476,6 @@ tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
     # calculate snpInsideRuns
     snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile)
 
-    # add a column with the row names as integer
-    snpInsideRuns$Number <- as.integer(row.names(snpInsideRuns))
-
     # filter by threshold once
     snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ]
 
diff --git a/detectRUNS/R/plots.R b/detectRUNS/R/plots.R
index 0ac2db9..0b865c4 100644
--- a/detectRUNS/R/plots.R
+++ b/detectRUNS/R/plots.R
@@ -323,6 +323,10 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa
     print(paste("N.of SNP is",nrow(mapChrom)))
 
     snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
+
+    # remove Number column
+    snpInRuns$Number <- NULL
+
     krom <- subset(snpInRuns,CHR==chromosome)
 
     p <- ggplot(data=krom, aes(x=POSITION/(10^6), y=PERCENTAGE, colour=BREED))
@@ -433,6 +437,10 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33,
     runsChrom <- runs[runs$CHROMOSOME==chrom,]
     mapChrom <- mappa[mappa$CHR==chrom,]
     snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile)
+
+    # remove Number column
+    snpInRuns$Number <- NULL
+
     all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
     n=n+1
     setTxtProgressBar(pb, n)
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index aa1a401..554c6d9 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -948,6 +948,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
   IntegerVector count(result_size);
   CharacterVector breed(result_size);
   NumericVector percentage(result_size);
+  IntegerVector number(result_size);
 
   // get all populations
   DataFrame pops = readPOPCpp(genotypeFile);
@@ -988,6 +989,7 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
       count[index] = snpCounts[ras];
       breed[index] = ras;
       percentage[index] = double(snpCounts[ras])/nBreeds[ras]*100;
+      number[index] = j+1;
 
       // debug
       // if (SNP_NAME[j] == "OAR24_6970428.1") {
@@ -1006,9 +1008,14 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
 
   // initialize dataframe of results.
   DataFrame res = DataFrame::create(
-    Named("SNP_NAME")=snp_name, Named("CHR")=chrom, Named("POSITION")=position,
-    Named("COUNT")=count, Named("BREED")=fast_factor(breed)  , //returns a factor
-    Named("PERCENTAGE")=percentage, _["stringsAsFactors"] = false);
+    Named("SNP_NAME") = snp_name,
+    Named("CHR") = chrom,
+    Named("POSITION") = position,
+    Named("COUNT") = count,
+    Named("BREED") = fast_factor(breed), //returns a factor
+    Named("PERCENTAGE") = percentage,
+    Named("Number") = number,
+    _["stringsAsFactors"] = false);
 
   // returning all runs for this individual genotype
   return(res);
@@ -1239,10 +1246,6 @@ DataFrame tableRunsCpp(
       continue;
     }
 
-    // add a column with the row names as integer
-    IntegerVector Number = seq(1, snpInsideRuns.nrows());
-    snpInsideRuns.push_back(Number, "Number");
-
     // now filter snpInsideRuns relying on threshold
     snpInsideRuns = filter_snpInsideRuns_by_threshold(snpInsideRuns, threshold_used);
 
@@ -1285,7 +1288,7 @@ DataFrame tableRunsCpp(
       // Rprintf("Start_SNP: %s, snp_count: %d, ", Start_SNP.get_cstring(), snp_count);
       // Rprintf("sum_pct: %f\n", sum_pct);
 
-      for (unsigned int x=1; x<group_subset.nrows(); x++) {
+      for (int x=1; x<group_subset.nrows(); x++) {
         // Rprintf("Processing: %s, ", as<std::string>(snp_names[x]).c_str());
         // Rprintf("start: %d, perc: %f\n", positions[x], sum_pcts[x]);
 
diff --git a/detectRUNS/tests/testthat/test_functions.R b/detectRUNS/tests/testthat/test_functions.R
index 0b45482..40833b4 100644
--- a/detectRUNS/tests/testthat/test_functions.R
+++ b/detectRUNS/tests/testthat/test_functions.R
@@ -26,10 +26,10 @@ test_that("Test readMapFile", {
   reference <- data.table::fread(mapFile, header = FALSE, colClasses = colClasses)
   names(reference) <- c("CHR","SNP_NAME","x","POSITION")
   reference$x <- NULL
-  
+
   test <- readMapFile(mapFile)
   expect_equal(reference, test)
-  
+
   # test for a non existing file
   expect_error(readMapFile("does_not_exists.map"), "doesn't exists")
 })
@@ -845,7 +845,7 @@ test_that("Testing consecutiveRunsCpp", {
 test_that("Testing snpInsideRuns", {
   # read mapfile with custom methods
   mappa <- readMapFile(mapFile)
-  
+
   # this is the chromosome I want to test
   chrom <- "24"
 
@@ -868,6 +868,9 @@ test_that("Testing snpInsideRuns", {
   reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile)
   test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
 
+  # remove Number column from test
+  test$Number <- NULL
+
   # testing functions
   expect_equivalent(test, reference)
 
@@ -876,35 +879,38 @@ test_that("Testing snpInsideRuns", {
 test_that("Testing snpInsideRuns with CHR as strings", {
   # read mapfile with custom methods
   mappa <- readMapFile(mapFile)
-  
+
   # this is the chromosome I want to test
   chrom <- "X"
-  
+
   # replacing chromsomes with a string value (which cannot be converted as int)
   mappa$CHR <- chrom
-  
+
   # subsetting mapChrom (get x random snps from mapfile)
   set.seed(42)
   mapChrom <- mappa[mappa$CHR==chrom, ]
   mapChrom <- mapChrom[sort(sample(nrow(mapChrom), 10)), ]
-  
+
   # loading pre-calculated data
   runsFile <- "test.ROHet.sliding.csv"
   colClasses <- c(rep("character", 3), rep("numeric", 4)  )
   runs <- read.csv2(runsFile, header = TRUE, stringsAsFactors = FALSE,
                     colClasses = colClasses)
-  
+
   # replacing chromsomes with a string value (which cannot be converted as int)
   runs$chrom <- chrom
-  
+
   # fix column names and define runsChrom
   names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
   runsChrom <- runs[runs$CHROMOSOME==chrom, ]
-  
+
   # get snps inside runs
   reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile)
   test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
-  
+
+  # remove Number column from test
+  test$Number <- NULL
+
   # testing functions
   expect_equivalent(test, reference)
-})
\ No newline at end of file
+})

From ab4d958377e9ab9fc27db7346095811f3ccf433a Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Tue, 5 Apr 2022 16:45:15 +0200
Subject: [PATCH 19/23] :fire: remove tableRuns form R code

---
 detectRUNS/NAMESPACE                   |   1 -
 detectRUNS/R/RcppExports.R             |   6 +-
 detectRUNS/R/Stats.R                   | 169 -------------------------
 detectRUNS/man/tableRuns.Rd            |   8 +-
 detectRUNS/man/tableRunsCpp.Rd         |  52 --------
 detectRUNS/src/RcppExports.cpp         |  10 +-
 detectRUNS/src/functions.cpp           |   4 +-
 detectRUNS/tests/testthat/test_Stats.R |  14 --
 8 files changed, 15 insertions(+), 249 deletions(-)
 delete mode 100644 detectRUNS/man/tableRunsCpp.Rd

diff --git a/detectRUNS/NAMESPACE b/detectRUNS/NAMESPACE
index 11039e4..183f836 100644
--- a/detectRUNS/NAMESPACE
+++ b/detectRUNS/NAMESPACE
@@ -16,7 +16,6 @@ export(reorderDF)
 export(slidingRUNS.run)
 export(summaryRuns)
 export(tableRuns)
-export(tableRunsCpp)
 import(ggplot2)
 import(itertools)
 import(plyr)
diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index 946d400..ecdf88d 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -232,14 +232,14 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
 #' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
 #' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
 #'
-#' table <- tableRunsCpp(
+#' table <- tableRuns(
 #'   runs = runsDF, genotypeFile = genotypeFile,
 #'   mapFile = mapFile, threshold = 0.5)
 #'
 #' @useDynLib detectRUNS
 #' @importFrom Rcpp sourceCpp
 #'
-tableRunsCpp <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
-    .Call('_detectRUNS_tableRunsCpp', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
+tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
+    .Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
 }
 
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index d14da1c..343df49 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -397,172 +397,3 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
 
   return(result_summary)
 }
-
-
-#' Function to retrieve most common runs in the population
-#'
-#' This function takes in input either the run results and returns a subset of
-#' the runs most commonly found in the group/population. The parameter
-#' \code{threshold} controls the definition
-#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
-#'
-#' @param genotypeFile Plink ped file (for SNP position)
-#' @param mapFile Plink map file (for SNP position)
-#' @param runs R object (dataframe) with results on detected runs
-#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
-#' proportion of individuals carrying that run (e.g. 70\%)
-#'
-#' @return A dataframe with the most common runs detected in the sampled individuals
-#' (the group/population, start and end position of the run, chromosome, number of SNP
-#' included in the run and average percentage of SNPs in run
-#' are reported in the output dataframe)
-#' @export
-#'
-#' @examples
-#' # getting map and ped paths
-#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
-#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
-#'
-#' # calculating runs of Homozygosity
-#' \dontrun{
-#' # skipping runs calculation
-#' runs <- slidingRUNS.run(genotypeFile, mapFile,
-#'   windowSize = 15, threshold = 0.1, minSNP = 15,
-#'   ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
-#' )
-#' }
-#' # loading pre-calculated data
-#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
-#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
-#'
-#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
-#'
-tableRuns <- function(runs = NULL, genotypeFile, mapFile, threshold = 0.5) {
-
-  # set a threshold
-  threshold_used <- threshold * 100
-  message(paste("Threshold used:", threshold_used))
-
-  # read map file
-  mappa <- readMapFile(mapFile)
-
-  # change colnames in runs file
-  names(runs) <- c("POPULATION", "IND", "CHROMOSOME", "COUNT", "START", "END", "LENGTH")
-
-  # vector of breeds
-  group_list <- as.vector(unique(runs$POPULATION))
-
-  # final data frame
-  final_table <- data.frame(
-    "GROUP" = character(0), "Start_SNP" = character(0), "End_SNP" = character(0),
-    "chrom" = character(0), "nSNP" = integer(0), "from" = integer(0), "to" = integer(0), "avg_pct" = numeric(0)
-  )
-
-  # Start calculation % SNP in ROH
-  message("Calculation % SNP in ROH") # FILIPPO
-
-  # create progress bar
-  total <- length(unique(runs$CHROMOSOME))
-  message(paste("Chromosome founds: ", total)) # FILIPPO
-  n <- 0
-  pb <- txtProgressBar(min = 0, max = total, style = 3)
-
-  # SNP in ROH
-  for (chrom in sort(unique(runs$CHROMOSOME))) {
-    # extract from map and runs only the chromosome I need
-    runsChrom <- runs[runs$CHROMOSOME == chrom, ]
-    mapKrom <- mappa[mappa$CHR == chrom, ]
-
-    # calculate snpInsideRuns
-    snpInsideRuns <- snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile)
-
-    # filter by threshold once
-    snpInsideRuns <- snpInsideRuns[snpInsideRuns$PERCENTAGE >= threshold_used, ]
-
-    for (grp in group_list) {
-      # create subset for group
-      group_subset <- snpInsideRuns[snpInsideRuns$BREED %in% c(grp), ]
-
-      # after filtering, I need to have at least 2 rows or I can't do the following stuff
-      if (nrow(group_subset) < 2) {
-        next
-      }
-
-      # message("Start from: ", paste0(group_subset[1, ], sep = " "))
-
-      # initialize stuff
-      old_idx <- group_subset[1, 7]
-      from <- group_subset[1, 3]
-      Start_SNP <- group_subset[1, 1]
-      snp_count <- 1
-      sum_pct <- group_subset[1, "PERCENTAGE"]
-
-      for (x in 2:nrow(group_subset)) {
-        # message("Processing: ", paste0(group_subset[x, ], sep = " "))
-
-        # get current index
-        new_idx <- group_subset[x, 7]
-
-        # Difference between indexes. This will be 1 in case of adjacent rows
-        diff <- new_idx - old_idx
-
-        if ((diff > 1) | x == nrow(group_subset)) {
-          if (x == nrow(group_subset)) {
-            # message("End of subset")
-            end_SNP <- group_subset[x, 1]
-            TO <- group_subset[x, 3]
-          } else {
-            # message("End of segment")
-            end_SNP <- group_subset[x - 1, 1]
-            TO <- group_subset[x - 1, 3]
-          }
-
-          # throw away row if composed by only one SNP
-          if (snp_count > 1) {
-            final_table <- rbind.data.frame(final_table, final_table = data.frame(
-              "Group" = group_subset[x - 1, 5],
-              "Start_SNP" = Start_SNP,
-              "End_SNP" = end_SNP,
-              "chrom" = group_subset[x - 1, 2],
-              "nSNP" = snp_count,
-              "from" = from,
-              "to" = TO,
-              "avg_pct" = sum_pct
-            ))
-
-            # message("Writing: ", paste0(tail(final_table, n = 1), sep = " "))
-          }
-
-          # reset variable
-          # message("Start a new segment: ", paste0(group_subset[x, ], sep = " "))
-          snp_count <- 1
-          sum_pct <- group_subset[x, "PERCENTAGE"]
-          from <- group_subset[x, 3]
-          Start_SNP <- group_subset[x, 1]
-        } else {
-          # update stuff
-          snp_count <- snp_count + 1
-          sum_pct <- sum_pct + group_subset[x, "PERCENTAGE"]
-
-          # message("Add snp to group: ", paste(snp_count, new_idx, old_idx, sum_pct))
-        }
-
-        # update index
-        old_idx <- new_idx
-      }
-    }
-
-    n <- n + 1
-    setTxtProgressBar(pb, n)
-  }
-
-  close(pb)
-
-  message("Calculation % SNP in ROH finish") # FILIPPO
-
-  final_table$avg_pct <- final_table$avg_pct / final_table$nSNP
-
-  row.names(final_table) <- seq(1, nrow(final_table))
-
-  return(final_table)
-}
diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd
index 39e34e1..8c8bdb6 100644
--- a/detectRUNS/man/tableRuns.Rd
+++ b/detectRUNS/man/tableRuns.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/Stats.R
+% Please edit documentation in R/RcppExports.R
 \name{tableRuns}
 \alias{tableRuns}
 \title{Function to retrieve most common runs in the population}
 \usage{
-tableRuns(runs = NULL, genotypeFile, mapFile, threshold = 0.5)
+tableRuns(runs, genotypeFile, mapFile, threshold = 0.5)
 }
 \arguments{
 \item{runs}{R object (dataframe) with results on detected runs}
@@ -45,6 +45,8 @@ runs <- slidingRUNS.run(genotypeFile, mapFile,
 runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
 runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
 
-tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
+table <- tableRuns(
+  runs = runsDF, genotypeFile = genotypeFile,
+  mapFile = mapFile, threshold = 0.5)
 
 }
diff --git a/detectRUNS/man/tableRunsCpp.Rd b/detectRUNS/man/tableRunsCpp.Rd
deleted file mode 100644
index 7672a3a..0000000
--- a/detectRUNS/man/tableRunsCpp.Rd
+++ /dev/null
@@ -1,52 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/RcppExports.R
-\name{tableRunsCpp}
-\alias{tableRunsCpp}
-\title{Function to retrieve most common runs in the population}
-\usage{
-tableRunsCpp(runs, genotypeFile, mapFile, threshold = 0.5)
-}
-\arguments{
-\item{runs}{R object (dataframe) with results on detected runs}
-
-\item{genotypeFile}{Plink ped file (for SNP position)}
-
-\item{mapFile}{Plink map file (for SNP position)}
-
-\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired
-proportion of individuals carrying that run (e.g. 70\%)}
-}
-\value{
-A dataframe with the most common runs detected in the sampled individuals
-(the group/population, start and end position of the run, chromosome, number of SNP
-included in the run and average percentage of SNPs in run
-are reported in the output dataframe)
-}
-\description{
-This function takes in input either the run results and returns a subset of
-the runs most commonly found in the group/population. The parameter
-\code{threshold} controls the definition
-of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
-}
-\examples{
-# getting map and ped paths
-genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
-mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
-
-# calculating runs of Homozygosity
-\dontrun{
-# skipping runs calculation
-runs <- slidingRUNS.run(genotypeFile, mapFile,
-  windowSize = 15, threshold = 0.1, minSNP = 15,
-  ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
-)
-}
-# loading pre-calculated data
-runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
-runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
-
-table <- tableRunsCpp(
-  runs = runsDF, genotypeFile = genotypeFile,
-  mapFile = mapFile, threshold = 0.5)
-
-}
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index 862949d..e5c9485 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -159,9 +159,9 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
-// tableRunsCpp
-DataFrame tableRunsCpp(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold);
-RcppExport SEXP _detectRUNS_tableRunsCpp(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) {
+// tableRuns
+DataFrame tableRuns(DataFrame runs, std::string genotypeFile, std::string mapFile, const float threshold);
+RcppExport SEXP _detectRUNS_tableRuns(SEXP runsSEXP, SEXP genotypeFileSEXP, SEXP mapFileSEXP, SEXP thresholdSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -169,7 +169,7 @@ BEGIN_RCPP
     Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP);
     Rcpp::traits::input_parameter< std::string >::type mapFile(mapFileSEXP);
     Rcpp::traits::input_parameter< const float >::type threshold(thresholdSEXP);
-    rcpp_result_gen = Rcpp::wrap(tableRunsCpp(runs, genotypeFile, mapFile, threshold));
+    rcpp_result_gen = Rcpp::wrap(tableRuns(runs, genotypeFile, mapFile, threshold));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -186,7 +186,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_detectRUNS_readPOPCpp", (DL_FUNC) &_detectRUNS_readPOPCpp, 1},
     {"_detectRUNS_consecutiveRunsCpp", (DL_FUNC) &_detectRUNS_consecutiveRunsCpp, 9},
     {"_detectRUNS_snpInsideRunsCpp", (DL_FUNC) &_detectRUNS_snpInsideRunsCpp, 3},
-    {"_detectRUNS_tableRunsCpp", (DL_FUNC) &_detectRUNS_tableRunsCpp, 4},
+    {"_detectRUNS_tableRuns", (DL_FUNC) &_detectRUNS_tableRuns, 4},
     {NULL, NULL, 0}
 };
 
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 554c6d9..6b2a020 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1174,7 +1174,7 @@ DataFrame filter_snpInsideRuns_by_breed(
 //' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
 //' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
 //'
-//' table <- tableRunsCpp(
+//' table <- tableRuns(
 //'   runs = runsDF, genotypeFile = genotypeFile,
 //'   mapFile = mapFile, threshold = 0.5)
 //'
@@ -1182,7 +1182,7 @@ DataFrame filter_snpInsideRuns_by_breed(
 //' @importFrom Rcpp sourceCpp
 //'
 // [[Rcpp::export]]
-DataFrame tableRunsCpp(
+DataFrame tableRuns(
     DataFrame runs, std::string genotypeFile, std::string mapFile,
     const float threshold = 0.5) {
 
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index fe6a2b6..28d8475 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -14,23 +14,9 @@ test_that("Test tableRuns", {
     "integer", "integer", "numeric"
   )
   reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
-  levels(reference$Group) <- c("Jacobs", "Navajo-Churro")
 
   runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
   test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 
   expect_equal(reference, test)
 })
-
-test_that("Test tableRunsCpp", {
-  colClasses <- c(
-    "factor", "character", "character", "character", "numeric",
-    "integer", "integer", "numeric"
-  )
-  reference <- read.csv2("test.tableRuns.csv", colClasses = colClasses)
-
-  runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
-  test <- tableRunsCpp(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
-
-  expect_equal(reference, test)
-})

From 0082c305aa4eb69125855c08cb1bd87f7b143986 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Thu, 21 Apr 2022 15:25:44 +0200
Subject: [PATCH 20/23] :zap: calculate pops outside snpInsideRunsCpp

---
 detectRUNS/DESCRIPTION                     |  4 ++--
 detectRUNS/R/RcppExports.R                 |  6 +++---
 detectRUNS/R/Stats.R                       |  4 +++-
 detectRUNS/R/plots.R                       |  7 +++++--
 detectRUNS/man/snpInsideRunsCpp.Rd         |  4 ++--
 detectRUNS/src/RcppExports.cpp             |  8 ++++----
 detectRUNS/src/functions.cpp               | 12 ++++++++----
 detectRUNS/tests/testthat/test_functions.R |  8 ++++++--
 8 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/detectRUNS/DESCRIPTION b/detectRUNS/DESCRIPTION
index efcfced..90d49ef 100644
--- a/detectRUNS/DESCRIPTION
+++ b/detectRUNS/DESCRIPTION
@@ -2,8 +2,8 @@ Package: detectRUNS
 Type: Package
 Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
     Diploid Genomes
-Version: 0.9.6.9003
-Date: 2022-04-05
+Version: 0.9.6.9004
+Date: 2022-04-21
 Authors@R: c(
     person("Filippo","Biscarini", email="filippo.biscarini@gmail.com", role=c("aut", "cre")),
     person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index ecdf88d..790e01d 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
 #'
 #' @param runsChrom R object (dataframe) with results per chromosome
 #' @param mapChrom R map object with SNP per chromosome
-#' @param genotypeFile genotype (.ped) file location
+#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}}
 #'
 #' @return dataframe with counts per SNP in runs (per population)
 #'
@@ -192,8 +192,8 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
 #' @useDynLib detectRUNS
 #' @importFrom Rcpp sourceCpp
 #'
-snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
-    .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
+snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) {
+    .Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops)
 }
 
 #' Function to retrieve most common runs in the population
diff --git a/detectRUNS/R/Stats.R b/detectRUNS/R/Stats.R
index 343df49..49a9ba9 100644
--- a/detectRUNS/R/Stats.R
+++ b/detectRUNS/R/Stats.R
@@ -378,7 +378,9 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
     for (chrom in sort(unique(runs$CHROMOSOME))) {
       runsChrom <- runs[runs$CHROMOSOME==chrom,]
       mapKrom <- mappa[mappa$CHR==chrom,]
-      snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
+
+      pops <- readPOPCpp(genotypeFile)
+      snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops)
 
       # remove Number column
       snpInRuns$Number <- NULL
diff --git a/detectRUNS/R/plots.R b/detectRUNS/R/plots.R
index 0b865c4..9b9e03c 100644
--- a/detectRUNS/R/plots.R
+++ b/detectRUNS/R/plots.R
@@ -322,7 +322,8 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa
     mapChrom <- mappa[mappa$CHR==chromosome,]
     print(paste("N.of SNP is",nrow(mapChrom)))
 
-    snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
+    pops <- readPOPCpp(genotypeFile)
+    snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, pops)
 
     # remove Number column
     snpInRuns$Number <- NULL
@@ -436,7 +437,9 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33,
   for (chrom in sort(unique(runs$CHROMOSOME))) {
     runsChrom <- runs[runs$CHROMOSOME==chrom,]
     mapChrom <- mappa[mappa$CHR==chrom,]
-    snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile)
+
+    pops <- readPOPCpp(genotypeFile)
+    snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, pops)
 
     # remove Number column
     snpInRuns$Number <- NULL
diff --git a/detectRUNS/man/snpInsideRunsCpp.Rd b/detectRUNS/man/snpInsideRunsCpp.Rd
index 6e03a4d..e87b10f 100644
--- a/detectRUNS/man/snpInsideRunsCpp.Rd
+++ b/detectRUNS/man/snpInsideRunsCpp.Rd
@@ -4,14 +4,14 @@
 \alias{snpInsideRunsCpp}
 \title{Function to count number of times a SNP is in a RUN}
 \usage{
-snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
+snpInsideRunsCpp(runsChrom, mapChrom, pops)
 }
 \arguments{
 \item{runsChrom}{R object (dataframe) with results per chromosome}
 
 \item{mapChrom}{R map object with SNP per chromosome}
 
-\item{genotypeFile}{genotype (.ped) file location}
+\item{pops}{R object (dataframe) with populations see \code{\link{readPOPCpp}}}
 }
 \value{
 dataframe with counts per SNP in runs (per population)
diff --git a/detectRUNS/src/RcppExports.cpp b/detectRUNS/src/RcppExports.cpp
index e5c9485..d56ba46 100644
--- a/detectRUNS/src/RcppExports.cpp
+++ b/detectRUNS/src/RcppExports.cpp
@@ -147,15 +147,15 @@ BEGIN_RCPP
 END_RCPP
 }
 // snpInsideRunsCpp
-DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, std::string genotypeFile);
-RcppExport SEXP _detectRUNS_snpInsideRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP genotypeFileSEXP) {
+DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom, DataFrame pops);
+RcppExport SEXP _detectRUNS_snpInsideRunsCpp(SEXP runsChromSEXP, SEXP mapChromSEXP, SEXP popsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< DataFrame >::type runsChrom(runsChromSEXP);
     Rcpp::traits::input_parameter< DataFrame >::type mapChrom(mapChromSEXP);
-    Rcpp::traits::input_parameter< std::string >::type genotypeFile(genotypeFileSEXP);
-    rcpp_result_gen = Rcpp::wrap(snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile));
+    Rcpp::traits::input_parameter< DataFrame >::type pops(popsSEXP);
+    rcpp_result_gen = Rcpp::wrap(snpInsideRunsCpp(runsChrom, mapChrom, pops));
     return rcpp_result_gen;
 END_RCPP
 }
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 6b2a020..0a83467 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -908,7 +908,7 @@ void Runs::dumpRuns() {
 //'
 //' @param runsChrom R object (dataframe) with results per chromosome
 //' @param mapChrom R map object with SNP per chromosome
-//' @param genotypeFile genotype (.ped) file location
+//' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}}
 //'
 //' @return dataframe with counts per SNP in runs (per population)
 //'
@@ -919,7 +919,7 @@ void Runs::dumpRuns() {
 //'
 // [[Rcpp::export]]
 DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
-                           std::string genotypeFile) {
+                           DataFrame pops) {
 
   // get columns as vectors
   IntegerVector POSITIONS = mapChrom["POSITION"];
@@ -951,7 +951,6 @@ DataFrame snpInsideRunsCpp(DataFrame runsChrom, DataFrame mapChrom,
   IntegerVector number(result_size);
 
   // get all populations
-  DataFrame pops = readPOPCpp(genotypeFile);
   CharacterVector pop = pops["POP"];
 
   // instantiate a Runs object
@@ -1199,6 +1198,11 @@ DataFrame tableRuns(
   // read mapfile using R function (which uses data.table::fread)
   DataFrame mappa = readMapFile(mapFile);
 
+  // read populations from genotypeFile
+  Rprintf("Read populations from '%s'...\n", genotypeFile.c_str());
+  DataFrame pops = readPOPCpp(genotypeFile);
+  Rprintf("Done!\n");
+
   // get distinct chromosomes
   CharacterVector chromosomes = runs[2];
   CharacterVector unique_chromosomes = sort_unique(chromosomes);
@@ -1239,7 +1243,7 @@ DataFrame tableRuns(
     }
 
     // calculate snpInsideRuns
-    DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, genotypeFile);
+    DataFrame snpInsideRuns = snpInsideRunsCpp(runsChrom, mapKrom, pops);
 
     if (snpInsideRuns.nrows() == 0) {
       REprintf("No SNPs in runs for chromosome %s\n", chrom.c_str());
diff --git a/detectRUNS/tests/testthat/test_functions.R b/detectRUNS/tests/testthat/test_functions.R
index 40833b4..6e066bb 100644
--- a/detectRUNS/tests/testthat/test_functions.R
+++ b/detectRUNS/tests/testthat/test_functions.R
@@ -866,7 +866,9 @@ test_that("Testing snpInsideRuns", {
 
   # get snps inside runs
   reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile)
-  test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
+
+  pops <- readPOPCpp(genotypeFile)
+  test <- snpInsideRunsCpp(runsChrom, mapChrom, pops)
 
   # remove Number column from test
   test$Number <- NULL
@@ -906,7 +908,9 @@ test_that("Testing snpInsideRuns with CHR as strings", {
 
   # get snps inside runs
   reference <- snpInsideRuns(runsChrom, mapChrom, genotypeFile)
-  test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
+
+  pops <- readPOPCpp(genotypeFile)
+  test <- snpInsideRunsCpp(runsChrom, mapChrom, pops)
 
   # remove Number column from test
   test$Number <- NULL

From 52abad1fd8731b121385e618cc19279d1f77cd61 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Thu, 21 Apr 2022 15:47:59 +0200
Subject: [PATCH 21/23] :poop: attempt to fix performance scripts - not working

---
 performance/test-snpInsideRunsCpp.R | 12 +++++++++++-
 performance/test1.R                 | 20 ++++++++++++++------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/performance/test-snpInsideRunsCpp.R b/performance/test-snpInsideRunsCpp.R
index a59ec19..ea9555a 100644
--- a/performance/test-snpInsideRunsCpp.R
+++ b/performance/test-snpInsideRunsCpp.R
@@ -1,4 +1,9 @@
 
+library(devtools)
+library(here)
+
+load_all(path = here("detectRUNS/"))
+
 # get files path
 mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package="detectRUNS")
 genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS")
@@ -20,7 +25,12 @@ runs <- readExternalRuns(runsfile, program="detectRUNS")
 names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
 runsChrom <- runs[runs$CHROMOSOME==chrom, ]
 
+# loading pops
+pops <- readPOPCpp(genotypeFile)
+
+message("testing snpInsideRunsCpp...")
+
 # get snps inside runs
 for (i in 1:100000) {
-  test <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)  
+  test <- snpInsideRunsCpp(runsChrom, mapChrom, pops)
 }
diff --git a/performance/test1.R b/performance/test1.R
index 5bddf16..4e8ceda 100644
--- a/performance/test1.R
+++ b/performance/test1.R
@@ -1,6 +1,11 @@
 
 # Valuating detectRUNS performance
 
+library(devtools)
+library(here)
+
+load_all(path = here("detectRUNS/"))
+
 # clean up
 rm(list = ls())
 
@@ -75,8 +80,8 @@ steps <- ceiling(seq(1, length(x), length.out = (x_points+1) ))[-1]
 
 # calculating runs of Homozygosity
 runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1,
-                        minSNP = 15, ROHet = FALSE,  maxOppositeGenotype = 1,
-                        maxMiss = 1,  minLengthBps = 100000,  minDensity = 1/10000)
+                        minSNP = 15, ROHet = FALSE,  maxOppWindow = 1,
+                        maxMissWindow = 1,  minLengthBps = 100000,  minDensity = 1/10000)
 
 # fix column names
 names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
@@ -100,11 +105,11 @@ for (i in steps) {
   message(paste("Test sliding window: step", i))
 
   # calculate sliding window
-  y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap,
+  y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap,
                      parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss)
 
   test_sliding <- microbenchmark(
-    slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap,
+    slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap,
                   parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss),
     unit = 'ms',
     times = times
@@ -118,7 +123,7 @@ for (i in steps) {
 
   # check cpp slidingWindow
   test_slidingCpp <- microbenchmark(
-    slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap,
+    slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap,
                      parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss),
     unit = 'ms',
     times = times
@@ -256,9 +261,12 @@ for (i in steps) {
   tmp <- data.frame(fun=test_fun, step=test_step, time=test_snpInsideRuns$time, language=test_language)
   tests <- rbind(tests, tmp)
 
+  # read pops
+  pops <- readPOPCpp(genotypeFile)
+
   # check cpp snpInsideRuns
   test_snpInsideRunsCpp <- microbenchmark(
-    snpInsideRunsCpp(runs, mappa, genotypeFile),
+    snpInsideRunsCpp(runs, mappa, pops),
     unit = 'ms',
     times = times
   )

From 496642c51a5a95c52d810446e9e9760a5964c8b0 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Fri, 22 Apr 2022 11:39:21 +0200
Subject: [PATCH 22/23] :bug: fixed test1 performance test script

---
 performance/test1.R | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/performance/test1.R b/performance/test1.R
index 4e8ceda..1ba0bf1 100644
--- a/performance/test1.R
+++ b/performance/test1.R
@@ -25,12 +25,14 @@ parameters <- list(
   minSNP=5,
   ROHet=TRUE,
   maxOppositeGenotype=1,
+  maxOppWindow=1,
   maxMiss=1,
   maxGap=10^6,
   minLengthBps=1000,
   minDensity=1/10,
-  maxOppRun=NULL,
-  maxMissRun=NULL
+  maxOppRun=1,
+  maxMissRun=1,
+  maxMissWindow=1
 )
 
 # how many times perform test
@@ -57,7 +59,7 @@ animals <- readPOPCpp(genotypeFile = genotypeFile)
 
 # get only one individual. Get index
 # idx <- 1
-idx <- which(animals$ID=="H70")
+idx <- which(animals$ID=="H114")
 
 # get an animal
 animal <- animals[idx, ]
@@ -105,11 +107,11 @@ for (i in steps) {
   message(paste("Test sliding window: step", i))
 
   # calculate sliding window
-  y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap,
+  y <- slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap,
                      parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss)
 
   test_sliding <- microbenchmark(
-    slidingWindow(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap,
+    slidingWindow(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap,
                   parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss),
     unit = 'ms',
     times = times
@@ -123,7 +125,7 @@ for (i in steps) {
 
   # check cpp slidingWindow
   test_slidingCpp <- microbenchmark(
-    slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=2, parameters$maxGap,
+    slidingWindowCpp(subset_genotype, gaps, parameters$windowSize, step=1, parameters$maxGap,
                      parameters$ROHet, parameters$maxOppositeGenotype, parameters$maxMiss),
     unit = 'ms',
     times = times

From 856c488b5da4e9a6fbbb3dcf56be8d1dadd0dbc1 Mon Sep 17 00:00:00 2001
From: Paolo Cozzi <bunop@libero.it>
Date: Wed, 27 Apr 2022 13:30:21 +0200
Subject: [PATCH 23/23] :bug: check threshold value in tableRun

---
 detectRUNS/R/RcppExports.R             | 4 ++--
 detectRUNS/man/tableRuns.Rd            | 4 ++--
 detectRUNS/src/functions.cpp           | 9 +++++++--
 detectRUNS/tests/testthat/test_Stats.R | 5 +++++
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/detectRUNS/R/RcppExports.R b/detectRUNS/R/RcppExports.R
index 790e01d..6af6306 100644
--- a/detectRUNS/R/RcppExports.R
+++ b/detectRUNS/R/RcppExports.R
@@ -206,8 +206,8 @@ snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) {
 #' @param genotypeFile Plink ped file (for SNP position)
 #' @param mapFile Plink map file (for SNP position)
 #' @param runs R object (dataframe) with results on detected runs
-#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
-#' proportion of individuals carrying that run (e.g. 70\%)
+#' @param threshold value from 0 to 1 (default 0.5) that controls the desired
+#' proportion of individuals carrying that run (e.g. 50\%)
 #'
 #' @return A dataframe with the most common runs detected in the sampled individuals
 #' (the group/population, start and end position of the run, chromosome, number of SNP
diff --git a/detectRUNS/man/tableRuns.Rd b/detectRUNS/man/tableRuns.Rd
index 8c8bdb6..27f60aa 100644
--- a/detectRUNS/man/tableRuns.Rd
+++ b/detectRUNS/man/tableRuns.Rd
@@ -13,8 +13,8 @@ tableRuns(runs, genotypeFile, mapFile, threshold = 0.5)
 
 \item{mapFile}{Plink map file (for SNP position)}
 
-\item{threshold}{value from 0 to 1 (default 0.7) that controls the desired
-proportion of individuals carrying that run (e.g. 70\%)}
+\item{threshold}{value from 0 to 1 (default 0.5) that controls the desired
+proportion of individuals carrying that run (e.g. 50\%)}
 }
 \value{
 A dataframe with the most common runs detected in the sampled individuals
diff --git a/detectRUNS/src/functions.cpp b/detectRUNS/src/functions.cpp
index 0a83467..73005fa 100644
--- a/detectRUNS/src/functions.cpp
+++ b/detectRUNS/src/functions.cpp
@@ -1147,8 +1147,8 @@ DataFrame filter_snpInsideRuns_by_breed(
 //' @param genotypeFile Plink ped file (for SNP position)
 //' @param mapFile Plink map file (for SNP position)
 //' @param runs R object (dataframe) with results on detected runs
-//' @param threshold value from 0 to 1 (default 0.7) that controls the desired
-//' proportion of individuals carrying that run (e.g. 70\%)
+//' @param threshold value from 0 to 1 (default 0.5) that controls the desired
+//' proportion of individuals carrying that run (e.g. 50\%)
 //'
 //' @return A dataframe with the most common runs detected in the sampled individuals
 //' (the group/population, start and end position of the run, chromosome, number of SNP
@@ -1185,6 +1185,11 @@ DataFrame tableRuns(
     DataFrame runs, std::string genotypeFile, std::string mapFile,
     const float threshold = 0.5) {
 
+  // check for threshold value
+  if (threshold > 1 || threshold < 0) {
+    throw std::range_error("Threshold must be between 0 and 1");
+  }
+
   // set a threshold
   float threshold_used = threshold * 100;
   Rprintf("Threshold used: %f\n", threshold_used);
diff --git a/detectRUNS/tests/testthat/test_Stats.R b/detectRUNS/tests/testthat/test_Stats.R
index 28d8475..7b5d321 100644
--- a/detectRUNS/tests/testthat/test_Stats.R
+++ b/detectRUNS/tests/testthat/test_Stats.R
@@ -19,4 +19,9 @@ test_that("Test tableRuns", {
   test <- tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
 
   expect_equal(reference, test)
+
+  expect_error(
+    tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 50),
+    "Threshold must be between 0 and 1"
+  )
 })