Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🔖 release v0.9.7 #28

Open
wants to merge 31 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
25e8677
updating the function tableRuns
filippob Jan 5, 2022
4c93dc4
updating help of the function tableRuns
filippob Jan 5, 2022
1f821bc
:twisted_rightwards_arrows: Merge branch 'master' into devel
bunop Mar 22, 2022
7c6bd2d
:white_check_mark: test tableRuns
bunop Mar 28, 2022
0362a33
:fire: remove SnpInRuns from tableRuns parameters
bunop Mar 28, 2022
9c74576
:zap: calculate tableRuns by chrom
bunop Mar 28, 2022
99cccaa
:construction_worker: attempt to fix coverage visualization
bunop Mar 29, 2022
0a7a24b
:bug: change tableRUNS threshold
bunop Mar 29, 2022
a6e37d4
:bug: fix bug in tableRuns
bunop Mar 30, 2022
437619e
:bug: fix bug in tableRuns filtering
bunop Mar 30, 2022
ca3d31e
:sparkles: start implementing tableRuns with Rcpp
bunop Apr 4, 2022
7db05eb
:sparkles: implement tableRuns in cpp
bunop Apr 4, 2022
2ca0e5a
:white_check_mark: test tableRunsCpp
bunop Apr 4, 2022
3c7d235
:bug: skip tableRuns when no data available
bunop Apr 4, 2022
c78177a
:recycle: refactor snpInsideRunsCpp internal variables
bunop Apr 4, 2022
677fcad
:recycle: filter tableRuns by threshold before evaluating breeds
bunop Apr 5, 2022
84eb6e6
:monocle_face: evaluate tableRuns performance in Cpp and R
bunop Apr 5, 2022
d05ef4d
:poop: duplicate snpInsideRunsCpp to ensure backward compatibility
bunop Apr 5, 2022
ef971b6
:zap: no more pre-allocate memory in countSnpInRunsCpp
bunop Apr 5, 2022
8d081e3
:rewind: revert countSnpInRunsCpp related code
bunop Apr 5, 2022
43614e6
:recycle: add Number column to snpInsideRunsCpp result
bunop Apr 5, 2022
ab4d958
:fire: remove tableRuns form R code
bunop Apr 5, 2022
0082c30
:zap: calculate pops outside snpInsideRunsCpp
bunop Apr 21, 2022
52abad1
:poop: attempt to fix performance scripts - not working
bunop Apr 21, 2022
496642c
:bug: fixed test1 performance test script
bunop Apr 22, 2022
856c488
:bug: check threshold value in tableRun
bunop Apr 27, 2022
79b04a6
:alien: set guides(fill="none") in plot_InbreedingChr and plot_Distri…
bunop Apr 28, 2023
466f55a
Merge pull request #40 from bioinformatics-ptp/issue-39
filippob May 9, 2023
6503f49
Merge branch 'devel' into issue-31
filippob May 9, 2023
a35f7a3
Merge pull request #32 from bioinformatics-ptp/issue-31
filippob May 9, 2023
567859e
Updated documentation and vignette
filippob May 9, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,5 @@ jobs:
working-directory: ./detectRUNS

- name: Test coverage
run: covr::codecov()
run: covr::codecov(path="detectRUNS/")
shell: Rscript {0}
working-directory: ./detectRUNS
8 changes: 4 additions & 4 deletions detectRUNS/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ Package: detectRUNS
Type: Package
Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
Diploid Genomes
Version: 0.9.6.9000
Date: 2020-12-20
Version: 0.9.6.9004
Date: 2022-04-21
Authors@R: c(
person("Filippo","Biscarini", email="[email protected]", role=c("aut", "cre")),
person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"),
person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
person("Giustino","Gaspa", email="[email protected]", role="aut"),
person("Gabriele","Marras", email="[email protected]", role="aut")
)
Expand All @@ -28,7 +28,7 @@ Imports:
Rcpp,
gridExtra,
data.table
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
Suggests:
testthat,
knitr,
Expand Down
53 changes: 50 additions & 3 deletions detectRUNS/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
#'
#' @param runsChrom R object (dataframe) with results per chromosome
#' @param mapChrom R map object with SNP per chromosome
#' @param genotypeFile genotype (.ped) file location
#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}}
#'
#' @return dataframe with counts per SNP in runs (per population)
#'
Expand All @@ -192,7 +192,54 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
#' @useDynLib detectRUNS
#' @importFrom Rcpp sourceCpp
#'
snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
.Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) {
.Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops)
}

#' Function to retrieve most common runs in the population
#'
#' This function takes in input either the run results and returns a subset of
#' the runs most commonly found in the group/population. The parameter
#' \code{threshold} controls the definition
#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
#'
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param threshold value from 0 to 1 (default 0.5) that controls the desired
#' proportion of individuals carrying that run (e.g. 50\%)
#'
#' @return A dataframe with the most common runs detected in the sampled individuals
#' (the group/population, start and end position of the run, chromosome, number of SNP
#' included in the run and average percentage of SNPs in run
#' are reported in the output dataframe)
#' @export
#'
#' @examples
#' # getting map and ped paths
#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
#'
#' # calculating runs of Homozygosity
#' \dontrun{
#' # skipping runs calculation
#' runs <- slidingRUNS.run(genotypeFile, mapFile,
#' windowSize = 15, threshold = 0.1, minSNP = 15,
#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
#' )
#' }
#' # loading pre-calculated data
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
#'
#' table <- tableRuns(
#' runs = runsDF, genotypeFile = genotypeFile,
#' mapFile = mapFile, threshold = 0.5)
#'
#' @useDynLib detectRUNS
#' @importFrom Rcpp sourceCpp
#'
tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
.Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
}

182 changes: 19 additions & 163 deletions detectRUNS/R/Stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
chromosomeLength <- function(mapFile){
# read mapfile
mappa <- readMapFile(mapFile)

maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0

# defining NULL variables to avoid warning messages
Expand Down Expand Up @@ -243,7 +243,16 @@ Froh_inbreedingClass <- function(runs, mapFile, Class=2){
#'
#' @return A list of dataframes containing the most relevant descriptives
#' statistics on detected runs. The list conveniently contains 9 dataframes that can
#' be used for further processing and visualization, or can be written out to text files
#' be used for further processing and visualization, or can be written out to text files:
#' 1) summary_ROH_count_chr: n. of runs per chromosome and breed/group;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This list will be rendered as a plain text. Consider using list with roxygen, for example:

#' \enumerate{
#'  \item{summary_ROH_count_chr: }{n. of runs per chromosome and breed/group}
#'  \item{summary_ROH_percentage_chr: }{percent distribution of runs per chromosome in each breed/group (sum to 1)}
#'  \item{summary_ROH_count: }{n. of runs per size-class (Mb) in each breed/group}
#' <add all the other items>
#' }

#' 2) summary_ROH_percentage_chr: percent distribution of runs per chromosome in each breed/group (sum to 1)
#' 3) summary_ROH_count: n. of runs per size-class (Mb) in each breed/group
#' 4) summary_ROH_percentage: percent distribution of runs per size-class (Mb) in each breed/group (sum to 1)
#' 5) summary_ROH_mean_chr: average size of runs (Mb) per chromosome and breed/group
#' 6) summary_ROH_mean_class: average size of runs (Mb) per size-class (Mb) in each breed/group
#' 7) result_Froh_genome_wide: genome-wide inbreeding ($F_{ROH}$) for each individual in the dataset
#' 8) result_Froh_chromosome_wide inbreeding ($F_{ROH}$) for each individual (and chromosome) in the dataset
#' 9) result_Froh_class: genome-wide inbreeding ($F_{ROH}$) for each individual in the dataset per size-class (Mb) of runs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There could be a 10th item when providing snpInRuns=TRUE as parameter

#' @export
#'
#' @examples
Expand Down Expand Up @@ -355,7 +364,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){

message("Calculating SNPs inside ROH")
names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")

#read map file
mappa <- readMapFile(mapFile)

Expand All @@ -378,177 +387,24 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapKrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
}
close(pb)

result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
message("Calculation % SNP in ROH finish") #FILIPPO
}



return(result_summary)
}


#' Function to retrieve most common runs in the population
#'
#' This function takes in input either the run results or the output from
#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run)
#' in the population/group, and returns a subset of the runs most commonly
#' found in the group/population. The parameter \code{threshold} controls the definition
#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
#'
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
#' proportion of individuals carrying that run (e.g. 70\%)
#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a
#' run in the population (output from \code{snpInsideRuns})
#'
#' @return A dataframe with the most common runs detected in the sampled individuals
#' (the group/population, start and end position of the run, chromosome and number of SNP
#' included in the run are reported in the output dataframe)
#' @export
#'
#' @examples
#' # getting map and ped paths
#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
#'
#' # calculating runs of Homozygosity
#' \dontrun{
#' # skipping runs calculation
#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15,
#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000)
#' }
#' # loading pre-calculated data
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
#'
#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
#'

tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) {

#set a threshold
threshold_used=threshold*100
message(paste('Threshold used:',threshold_used))

# read map file
mappa <- readMapFile(mapFile)

if(!is.null(runs) & is.null(SnpInRuns)){
message('I found only Runs data frame. GOOD!')
pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops)

#change colnames in runs file
names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")
# remove Number column
snpInRuns$Number <- NULL

#Start calculation % SNP in ROH
message("Calculation % SNP in ROH") #FILIPPO
all_SNPinROH <- data.frame("SNP_NAME"=character(),
"CHR"=integer(),
"POSITION"=numeric(),
"COUNT"=integer(),
"BREED"=factor(),
"PERCENTAGE"=numeric(),
stringsAsFactors=FALSE)

# create progress bar
total <- length(unique(runs$CHROMOSOME))
message(paste('Chromosome founds: ',total)) #FILIPPO
n=0
pb <- txtProgressBar(min = 0, max = total, style = 3)

#SNP in ROH
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapKrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
}
close(pb)

result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
message("Calculation % SNP in ROH finish") #FILIPPO
} else if (is.null(runs) & !is.null(SnpInRuns)) {
message('I found only SNPinRuns data frame. GOOD!')
all_SNPinROH=SnpInRuns
} else{
stop('You gave me Runs and SNPinRuns! Please choose one!')
}

#consecutive number
all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE))

#final data frame
final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0),
"chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0))


#vector of breeds
group_list=as.vector(unique(all_SNPinROH$BREED))

for (grp in group_list){
message(paste('checking: ',grp))

#create subset for group/thresold
group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,])

#print(group_subset)

#variable
old_pos=group_subset[1,7]
snp_pos1=group_subset[1,3]
Start_SNP=group_subset[1,1]
snp_count=0

x=2
while(x <= length(rownames(group_subset))) {

snp_count = snp_count + 1
new_pos=group_subset[x,7]
old_pos=group_subset[x-1,7]
chr_old=group_subset[x-1,2]
chr_new =group_subset[x,2]

diff=new_pos-old_pos

if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) {
if (x==length(rownames(group_subset))){
end_SNP=group_subset[x,1]
TO=group_subset[x,3]
}else{
end_SNP=group_subset[x-1,1]
TO=group_subset[x-1,3]
}

final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5],
"Start_SNP"=Start_SNP,
"End_SNP"=end_SNP,
"chrom"=group_subset[x-1,2],
"nSNP"=snp_count,
"from"=snp_pos1,
"to"=TO))

#reset variable
snp_count=0
snp_pos1=group_subset[x,3]
Start_SNP=group_subset[x,1]
}

#upgrade x value
x <- x+1

}
}

rownames(final_table) <- seq(1,length(row.names(final_table)))
return(final_table)
return(result_summary)
}

Loading