Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚡ refactor tableRuns to better manage memory #32

Merged
merged 24 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7c6bd2d
:white_check_mark: test tableRuns
bunop Mar 28, 2022
0362a33
:fire: remove SnpInRuns from tableRuns parameters
bunop Mar 28, 2022
9c74576
:zap: calculate tableRuns by chrom
bunop Mar 28, 2022
99cccaa
:construction_worker: attempt to fix coverage visualization
bunop Mar 29, 2022
0a7a24b
:bug: change tableRUNS threshold
bunop Mar 29, 2022
a6e37d4
:bug: fix bug in tableRuns
bunop Mar 30, 2022
437619e
:bug: fix bug in tableRuns filtering
bunop Mar 30, 2022
ca3d31e
:sparkles: start implementing tableRuns with Rcpp
bunop Apr 4, 2022
7db05eb
:sparkles: implement tableRuns in cpp
bunop Apr 4, 2022
2ca0e5a
:white_check_mark: test tableRunsCpp
bunop Apr 4, 2022
3c7d235
:bug: skip tableRuns when no data available
bunop Apr 4, 2022
c78177a
:recycle: refactor snpInsideRunsCpp internal variables
bunop Apr 4, 2022
677fcad
:recycle: filter tableRuns by threshold before evaluating breeds
bunop Apr 5, 2022
84eb6e6
:monocle_face: evaluate tableRuns performance in Cpp and R
bunop Apr 5, 2022
d05ef4d
:poop: duplicate snpInsideRunsCpp to ensure backward compatibility
bunop Apr 5, 2022
ef971b6
:zap: no more pre-allocate memory in countSnpInRunsCpp
bunop Apr 5, 2022
8d081e3
:rewind: revert countSnpInRunsCpp related code
bunop Apr 5, 2022
43614e6
:recycle: add Number column to snpInsideRunsCpp result
bunop Apr 5, 2022
ab4d958
:fire: remove tableRuns form R code
bunop Apr 5, 2022
0082c30
:zap: calculate pops outside snpInsideRunsCpp
bunop Apr 21, 2022
52abad1
:poop: attempt to fix performance scripts - not working
bunop Apr 21, 2022
496642c
:bug: fixed test1 performance test script
bunop Apr 22, 2022
856c488
:bug: check threshold value in tableRun
bunop Apr 27, 2022
6503f49
Merge branch 'devel' into issue-31
filippob May 9, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,5 @@ jobs:
working-directory: ./detectRUNS

- name: Test coverage
run: covr::codecov()
run: covr::codecov(path="detectRUNS/")
shell: Rscript {0}
working-directory: ./detectRUNS
6 changes: 3 additions & 3 deletions detectRUNS/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ Package: detectRUNS
Type: Package
Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
Diploid Genomes
Version: 0.9.6.9000
Date: 2020-12-20
Version: 0.9.6.9004
Date: 2022-04-21
Authors@R: c(
person("Filippo","Biscarini", email="[email protected]", role=c("aut", "cre")),
person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"),
person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
person("Giustino","Gaspa", email="[email protected]", role="aut"),
person("Gabriele","Marras", email="[email protected]", role="aut")
)
Expand Down
53 changes: 50 additions & 3 deletions detectRUNS/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
#'
#' @param runsChrom R object (dataframe) with results per chromosome
#' @param mapChrom R map object with SNP per chromosome
#' @param genotypeFile genotype (.ped) file location
#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}}
#'
#' @return dataframe with counts per SNP in runs (per population)
#'
Expand All @@ -192,7 +192,54 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
#' @useDynLib detectRUNS
#' @importFrom Rcpp sourceCpp
#'
snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
.Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) {
.Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops)
}

#' Function to retrieve most common runs in the population
#'
#' This function takes in input either the run results and returns a subset of
#' the runs most commonly found in the group/population. The parameter
#' \code{threshold} controls the definition
#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
#'
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param threshold value from 0 to 1 (default 0.5) that controls the desired
#' proportion of individuals carrying that run (e.g. 50\%)
#'
#' @return A dataframe with the most common runs detected in the sampled individuals
#' (the group/population, start and end position of the run, chromosome, number of SNP
#' included in the run and average percentage of SNPs in run
#' are reported in the output dataframe)
#' @export
#'
#' @examples
#' # getting map and ped paths
#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
#'
#' # calculating runs of Homozygosity
#' \dontrun{
#' # skipping runs calculation
#' runs <- slidingRUNS.run(genotypeFile, mapFile,
#' windowSize = 15, threshold = 0.1, minSNP = 15,
#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
#' )
#' }
#' # loading pre-calculated data
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
#'
#' table <- tableRuns(
#' runs = runsDF, genotypeFile = genotypeFile,
#' mapFile = mapFile, threshold = 0.5)
#'
#' @useDynLib detectRUNS
#' @importFrom Rcpp sourceCpp
#'
tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
.Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
}

178 changes: 9 additions & 169 deletions detectRUNS/R/Stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
chromosomeLength <- function(mapFile){
# read mapfile
mappa <- readMapFile(mapFile)

maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0

# defining NULL variables to avoid warning messages
Expand Down Expand Up @@ -355,7 +355,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){

message("Calculating SNPs inside ROH")
names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")

#read map file
mappa <- readMapFile(mapFile)

Expand All @@ -378,184 +378,24 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapKrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
}
close(pb)

result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
message("Calculation % SNP in ROH finish") #FILIPPO
}

pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops)

# remove Number column
snpInRuns$Number <- NULL

return(result_summary)
}


#' Function to retrieve most common runs in the population
#'
#' This function takes in input either the run results or the output from
#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run)
#' in the population/group, and returns a subset of the runs most commonly
#' found in the group/population. The parameter \code{threshold} controls the definition
#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
#'
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
#' proportion of individuals carrying that run (e.g. 70\%)
#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a
#' run in the population (output from \code{snpInsideRuns})
#'
#' @return A dataframe with the most common runs detected in the sampled individuals
#' (the group/population, start and end position of the run, chromosome, number of SNP
#' included in the run and average percentage of SNPs in run
#' are reported in the output dataframe)
#' @export
#'
#' @examples
#' # getting map and ped paths
#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
#'
#' # calculating runs of Homozygosity
#' \dontrun{
#' # skipping runs calculation
#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15,
#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000)
#' }
#' # loading pre-calculated data
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
#'
#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
#'

tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) {

#set a threshold
threshold_used=threshold*100
message(paste('Threshold used:',threshold_used))

# read map file
mappa <- readMapFile(mapFile)

if(!is.null(runs) & is.null(SnpInRuns)){
message('I found only Runs data frame. GOOD!')

#change colnames in runs file
names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")

#Start calculation % SNP in ROH
message("Calculation % SNP in ROH") #FILIPPO
all_SNPinROH <- data.frame("SNP_NAME"=character(),
"CHR"=integer(),
"POSITION"=numeric(),
"COUNT"=integer(),
"BREED"=factor(),
"PERCENTAGE"=numeric(),
stringsAsFactors=FALSE)

# create progress bar
total <- length(unique(runs$CHROMOSOME))
message(paste('Chromosome founds: ',total)) #FILIPPO
n=0
pb <- txtProgressBar(min = 0, max = total, style = 3)

#SNP in ROH
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapKrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
}
close(pb)

result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
message("Calculation % SNP in ROH finish") #FILIPPO
} else if (is.null(runs) & !is.null(SnpInRuns)) {
message('I found only SNPinRuns data frame. GOOD!')
all_SNPinROH=SnpInRuns
} else{
stop('You gave me Runs and SNPinRuns! Please choose one!')
}

#consecutive number
all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE))

#final data frame
final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0),
"chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0), "avg_pct"=numeric(0))


#vector of breeds
group_list=as.vector(unique(all_SNPinROH$BREED))

for (grp in group_list){
message(paste('checking: ',grp))

#create subset for group/thresold
group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,])

#print(group_subset)

#variable
old_pos=group_subset[1,7]
snp_pos1=group_subset[1,3]
Start_SNP=group_subset[1,1]
snp_count=0
sum_pct = 0

x=2
while(x <= length(rownames(group_subset))) {

snp_count = snp_count + 1
new_pos=group_subset[x,7]
old_pos=group_subset[x-1,7]
chr_old=group_subset[x-1,2]
chr_new =group_subset[x,2]
sum_pct = sum_pct + group_subset[x-1,"PERCENTAGE"]

diff=new_pos-old_pos

if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) {
if (x==length(rownames(group_subset))){
end_SNP=group_subset[x,1]
TO=group_subset[x,3]
}else{
end_SNP=group_subset[x-1,1]
TO=group_subset[x-1,3]
}

final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5],
"Start_SNP"=Start_SNP,
"End_SNP"=end_SNP,
"chrom"=group_subset[x-1,2],
"nSNP"=snp_count,
"from"=snp_pos1,
"to"=TO,
"avg_pct"=sum_pct))

#reset variable
snp_count=0
sum_pct=0
snp_pos1=group_subset[x,3]
Start_SNP=group_subset[x,1]
}

#upgrade x value
x <- x+1

}
}

final_table$avg_pct = final_table$avg_pct/final_table$nSNP

rownames(final_table) <- seq(1,length(row.names(final_table)))
return(final_table)
return(result_summary)
}

15 changes: 13 additions & 2 deletions detectRUNS/R/plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,12 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa
mapChrom <- mappa[mappa$CHR==chromosome,]
print(paste("N.of SNP is",nrow(mapChrom)))

snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, pops)

# remove Number column
snpInRuns$Number <- NULL

krom <- subset(snpInRuns,CHR==chromosome)

p <- ggplot(data=krom, aes(x=POSITION/(10^6), y=PERCENTAGE, colour=BREED))
Expand Down Expand Up @@ -432,7 +437,13 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33,
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapChrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile)

pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, pops)

# remove Number column
snpInRuns$Number <- NULL

all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
Expand Down
4 changes: 2 additions & 2 deletions detectRUNS/man/snpInsideRunsCpp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading