Skip to content

Commit

Permalink
Merge pull request #32 from bioinformatics-ptp/issue-31
Browse files Browse the repository at this point in the history
⚡ refactor tableRuns to better manage memory
  • Loading branch information
filippob authored May 9, 2023
2 parents 466f55a + 6503f49 commit a35f7a3
Show file tree
Hide file tree
Showing 15 changed files with 627 additions and 236 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,5 @@ jobs:
working-directory: ./detectRUNS

- name: Test coverage
run: covr::codecov()
run: covr::codecov(path="detectRUNS/")
shell: Rscript {0}
working-directory: ./detectRUNS
6 changes: 3 additions & 3 deletions detectRUNS/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ Package: detectRUNS
Type: Package
Title: Detect Runs of Homozygosity and Runs of Heterozygosity in
Diploid Genomes
Version: 0.9.6.9000
Date: 2020-12-20
Version: 0.9.6.9004
Date: 2022-04-21
Authors@R: c(
person("Filippo","Biscarini", email="[email protected]", role=c("aut", "cre")),
person("Paolo","Cozzi", email="paolo.cozzi@ptp.it", role="aut"),
person("Paolo","Cozzi", email="paolo.cozzi@ibba.cnr.it", role="aut"),
person("Giustino","Gaspa", email="[email protected]", role="aut"),
person("Gabriele","Marras", email="[email protected]", role="aut")
)
Expand Down
53 changes: 50 additions & 3 deletions detectRUNS/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
#'
#' @param runsChrom R object (dataframe) with results per chromosome
#' @param mapChrom R map object with SNP per chromosome
#' @param genotypeFile genotype (.ped) file location
#' @param pops R object (dataframe) with populations see \code{\link{readPOPCpp}}
#'
#' @return dataframe with counts per SNP in runs (per population)
#'
Expand All @@ -192,7 +192,54 @@ consecutiveRunsCpp <- function(indGeno, individual, mapFile, ROHet = TRUE, minSN
#' @useDynLib detectRUNS
#' @importFrom Rcpp sourceCpp
#'
snpInsideRunsCpp <- function(runsChrom, mapChrom, genotypeFile) {
.Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, genotypeFile)
snpInsideRunsCpp <- function(runsChrom, mapChrom, pops) {
.Call('_detectRUNS_snpInsideRunsCpp', PACKAGE = 'detectRUNS', runsChrom, mapChrom, pops)
}

#' Function to retrieve most common runs in the population
#'
#' This function takes in input either the run results and returns a subset of
#' the runs most commonly found in the group/population. The parameter
#' \code{threshold} controls the definition
#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
#'
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param threshold value from 0 to 1 (default 0.5) that controls the desired
#' proportion of individuals carrying that run (e.g. 50\%)
#'
#' @return A dataframe with the most common runs detected in the sampled individuals
#' (the group/population, start and end position of the run, chromosome, number of SNP
#' included in the run and average percentage of SNPs in run
#' are reported in the output dataframe)
#' @export
#'
#' @examples
#' # getting map and ped paths
#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
#'
#' # calculating runs of Homozygosity
#' \dontrun{
#' # skipping runs calculation
#' runs <- slidingRUNS.run(genotypeFile, mapFile,
#' windowSize = 15, threshold = 0.1, minSNP = 15,
#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1 / 10000
#' )
#' }
#' # loading pre-calculated data
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package = "detectRUNS")
#' runsDF <- readExternalRuns(inputFile = runsFile, program = "detectRUNS")
#'
#' table <- tableRuns(
#' runs = runsDF, genotypeFile = genotypeFile,
#' mapFile = mapFile, threshold = 0.5)
#'
#' @useDynLib detectRUNS
#' @importFrom Rcpp sourceCpp
#'
tableRuns <- function(runs, genotypeFile, mapFile, threshold = 0.5) {
.Call('_detectRUNS_tableRuns', PACKAGE = 'detectRUNS', runs, genotypeFile, mapFile, threshold)
}

178 changes: 9 additions & 169 deletions detectRUNS/R/Stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
chromosomeLength <- function(mapFile){
# read mapfile
mappa <- readMapFile(mapFile)

maps<-mappa[mappa$POSITION != 0, ] #delete chromosome 0

# defining NULL variables to avoid warning messages
Expand Down Expand Up @@ -355,7 +355,7 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){

message("Calculating SNPs inside ROH")
names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")

#read map file
mappa <- readMapFile(mapFile)

Expand All @@ -378,184 +378,24 @@ summaryRuns <- function(runs, mapFile, genotypeFile, Class=2, snpInRuns=FALSE){
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapKrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
}
close(pb)

result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
message("Calculation % SNP in ROH finish") #FILIPPO
}

pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, pops)

# remove Number column
snpInRuns$Number <- NULL

return(result_summary)
}


#' Function to retrieve most common runs in the population
#'
#' This function takes in input either the run results or the output from
#' the function \code{snpInsideRuns} (proportion of times a SNP is inside a run)
#' in the population/group, and returns a subset of the runs most commonly
#' found in the group/population. The parameter \code{threshold} controls the definition
#' of most common (e.g. in at least 50\%, 70\% etc. of the sampled individuals)
#'
#' @param genotypeFile Plink ped file (for SNP position)
#' @param mapFile Plink map file (for SNP position)
#' @param runs R object (dataframe) with results on detected runs
#' @param threshold value from 0 to 1 (default 0.7) that controls the desired
#' proportion of individuals carrying that run (e.g. 70\%)
#' @param SnpInRuns dataframe with the proportion of times each SNP falls inside a
#' run in the population (output from \code{snpInsideRuns})
#'
#' @return A dataframe with the most common runs detected in the sampled individuals
#' (the group/population, start and end position of the run, chromosome, number of SNP
#' included in the run and average percentage of SNPs in run
#' are reported in the output dataframe)
#' @export
#'
#' @examples
#' # getting map and ped paths
#' genotypeFile <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package = "detectRUNS")
#' mapFile <- system.file("extdata", "Kijas2016_Sheep_subset.map", package = "detectRUNS")
#'
#' # calculating runs of Homozygosity
#' \dontrun{
#' # skipping runs calculation
#' runs <- slidingRUNS.run(genotypeFile, mapFile, windowSize = 15, threshold = 0.1, minSNP = 15,
#' ROHet = FALSE, maxOppositeGenotype = 1, maxMiss = 1, minLengthBps = 100000, minDensity = 1/10000)
#' }
#' # loading pre-calculated data
#' runsFile <- system.file("extdata", "Kijas2016_Sheep_subset.sliding.csv", package="detectRUNS")
#' runsDF = readExternalRuns(inputFile = runsFile, program = 'detectRUNS')
#'
#' tableRuns(runs = runsDF, genotypeFile = genotypeFile, mapFile = mapFile, threshold = 0.5)
#'

tableRuns <- function(runs=NULL,SnpInRuns=NULL,genotypeFile, mapFile, threshold = 0.5) {

#set a threshold
threshold_used=threshold*100
message(paste('Threshold used:',threshold_used))

# read map file
mappa <- readMapFile(mapFile)

if(!is.null(runs) & is.null(SnpInRuns)){
message('I found only Runs data frame. GOOD!')

#change colnames in runs file
names(runs) <- c("POPULATION","IND","CHROMOSOME","COUNT","START","END","LENGTH")

#Start calculation % SNP in ROH
message("Calculation % SNP in ROH") #FILIPPO
all_SNPinROH <- data.frame("SNP_NAME"=character(),
"CHR"=integer(),
"POSITION"=numeric(),
"COUNT"=integer(),
"BREED"=factor(),
"PERCENTAGE"=numeric(),
stringsAsFactors=FALSE)

# create progress bar
total <- length(unique(runs$CHROMOSOME))
message(paste('Chromosome founds: ',total)) #FILIPPO
n=0
pb <- txtProgressBar(min = 0, max = total, style = 3)

#SNP in ROH
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapKrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapKrom, genotypeFile)
all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
}
close(pb)

result_summary=append(result_summary,list(SNPinRun = all_SNPinROH))
message("Calculation % SNP in ROH finish") #FILIPPO
} else if (is.null(runs) & !is.null(SnpInRuns)) {
message('I found only SNPinRuns data frame. GOOD!')
all_SNPinROH=SnpInRuns
} else{
stop('You gave me Runs and SNPinRuns! Please choose one!')
}

#consecutive number
all_SNPinROH$Number <- seq(1,length(all_SNPinROH$PERCENTAGE))

#final data frame
final_table <- data.frame("GROUP"=character(0),"Start_SNP"=character(0),"End_SNP"=character(0),
"chrom"=character(0),"nSNP"=integer(0),"from"=integer(0),"to"=integer(0), "avg_pct"=numeric(0))


#vector of breeds
group_list=as.vector(unique(all_SNPinROH$BREED))

for (grp in group_list){
message(paste('checking: ',grp))

#create subset for group/thresold
group_subset=as.data.frame(all_SNPinROH[all_SNPinROH$BREED %in% c(grp) & all_SNPinROH$PERCENTAGE > threshold_used,])

#print(group_subset)

#variable
old_pos=group_subset[1,7]
snp_pos1=group_subset[1,3]
Start_SNP=group_subset[1,1]
snp_count=0
sum_pct = 0

x=2
while(x <= length(rownames(group_subset))) {

snp_count = snp_count + 1
new_pos=group_subset[x,7]
old_pos=group_subset[x-1,7]
chr_old=group_subset[x-1,2]
chr_new =group_subset[x,2]
sum_pct = sum_pct + group_subset[x-1,"PERCENTAGE"]

diff=new_pos-old_pos

if ((diff > 1) | (chr_new != chr_old) | x==length(rownames(group_subset))) {
if (x==length(rownames(group_subset))){
end_SNP=group_subset[x,1]
TO=group_subset[x,3]
}else{
end_SNP=group_subset[x-1,1]
TO=group_subset[x-1,3]
}

final_table <- rbind.data.frame(final_table,final_table=data.frame("Group"= group_subset[x-1,5],
"Start_SNP"=Start_SNP,
"End_SNP"=end_SNP,
"chrom"=group_subset[x-1,2],
"nSNP"=snp_count,
"from"=snp_pos1,
"to"=TO,
"avg_pct"=sum_pct))

#reset variable
snp_count=0
sum_pct=0
snp_pos1=group_subset[x,3]
Start_SNP=group_subset[x,1]
}

#upgrade x value
x <- x+1

}
}

final_table$avg_pct = final_table$avg_pct/final_table$nSNP

rownames(final_table) <- seq(1,length(row.names(final_table)))
return(final_table)
return(result_summary)
}

15 changes: 13 additions & 2 deletions detectRUNS/R/plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,12 @@ plot_SnpsInRuns <- function(runs, genotypeFile, mapFile, savePlots=FALSE, separa
mapChrom <- mappa[mappa$CHR==chromosome,]
print(paste("N.of SNP is",nrow(mapChrom)))

snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, genotypeFile)
pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom, mapChrom, pops)

# remove Number column
snpInRuns$Number <- NULL

krom <- subset(snpInRuns,CHR==chromosome)

p <- ggplot(data=krom, aes(x=POSITION/(10^6), y=PERCENTAGE, colour=BREED))
Expand Down Expand Up @@ -432,7 +437,13 @@ plot_manhattanRuns <- function(runs, genotypeFile, mapFile, pct_threshold=0.33,
for (chrom in sort(unique(runs$CHROMOSOME))) {
runsChrom <- runs[runs$CHROMOSOME==chrom,]
mapChrom <- mappa[mappa$CHR==chrom,]
snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, genotypeFile)

pops <- readPOPCpp(genotypeFile)
snpInRuns <- snpInsideRunsCpp(runsChrom,mapChrom, pops)

# remove Number column
snpInRuns$Number <- NULL

all_SNPinROH <- rbind.data.frame(all_SNPinROH,snpInRuns)
n=n+1
setTxtProgressBar(pb, n)
Expand Down
4 changes: 2 additions & 2 deletions detectRUNS/man/snpInsideRunsCpp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit a35f7a3

Please sign in to comment.