diff --git a/R/semsim.R b/R/semsim.R
index e4e3a5f..270e536 100644
--- a/R/semsim.R
+++ b/R/semsim.R
@@ -213,6 +213,8 @@ cosine_similarity <- function(subsumer_mat = NA, terms = NULL, ...) {
 #' greater than any of its superclasses. If a function, it must accept parameter
 #' `x` as the vector of term IRIs and return a vector of frequencies (_not_
 #' IC scores) for the terms. The default is to use function [term_freqs()].
+#' Subsumer terms with zero or missing (NA) frequency will be omitted from
+#' the calculation.
 #' @param wt_args list, named parameters for the function calculating term
 #' frequencies. Ignored if `wt` is not a function. For the default `wt`
 #' function [term_freqs()], the main parameters are `as` and `corpus`.
@@ -248,7 +250,15 @@ resnik_similarity <- function(subsumer_mat = NA, terms = NULL, ...,
   if (missing(wt) || is.function(wt)) {
     wt_args$x <- rownames(subsumer_mat)
     wt <- do.call(wt, wt_args)
-    wt[wt == 0] <- 1
+    # Terms with frequency zero should not occur in the subsumer matrix, so
+    # if there are any, they either shouldn't have been a subsumer, or they
+    # didn't yield a count. Either way, remove them from the computation.
+    rowsToRemove <- is.na(wt) | wt == 0
+    if (any(rowsToRemove)) {
+      wt <- wt[! rowsToRemove]
+      # drop = FALSE: stay a matrix even if only a single subsumer row is left
+      subsumer_mat <- subsumer_mat[! rowsToRemove, , drop = FALSE]
+    }
     # we assume we got frequencies, turn into IC
     wt <- -log(wt, base = base)
   }
diff --git a/man/similarity.Rd b/man/similarity.Rd
index 930ebdc..4d3b9d4 100644
--- a/man/similarity.Rd
+++ b/man/similarity.Rd
@@ -36,7 +36,9 @@ scores, though any score will work for which a higher value means higher
 information content, and where a term will always have a score equal to or
 greater than any of its superclasses. If a function, it must accept parameter
 \code{x} as the vector of term IRIs and return a vector of frequencies (\emph{not}
-IC scores) for the terms. The default is to use function \code{\link[=term_freqs]{term_freqs()}}.}
+IC scores) for the terms. The default is to use function \code{\link[=term_freqs]{term_freqs()}}.
+Subsumer terms with zero or missing (NA) frequency will be omitted from the
+calculation.}
 
 \item{wt_args}{list, named parameters for the function calculating term
 frequencies. Ignored if \code{wt} is not a function. For the default \code{wt}
diff --git a/tests/testthat/test-semsim.R b/tests/testthat/test-semsim.R
index ec95918..d4472d2 100644
--- a/tests/testthat/test-semsim.R
+++ b/tests/testthat/test-semsim.R
@@ -84,9 +84,9 @@ test_that("Resnik similarity", {
   termICs <- -log10(term_freqs(phens$id, as = "phenotype", corpus = "taxa"))
   testthat::expect_equivalent(diag(sm.ic), termICs)
 
-  subs.ics <- -log10(term_freqs(rownames(subs.mat),
-                                as = "phenotype", corpus = "taxa"))
-  sm.ic2 <- resnik_similarity(subs.mat, wt = subs.ics)
+  tfreqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  sm.ic2 <- resnik_similarity(subs.mat[! (is.na(tfreqs) | tfreqs == 0), ],
+                              wt = -log10(tfreqs[! (is.na(tfreqs) | tfreqs == 0)]))
   testthat::expect_equal(sm.ic, sm.ic2)
 })
 
@@ -151,6 +151,9 @@ test_that("profile similarity with Resnik", {
   }))
 
   freqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  toKeep <- ! (is.na(freqs) | freqs == 0)
+  freqs <- freqs[toKeep]
+  subs.mat <- subs.mat[toKeep,]
   sm <- profile_similarity(resnik_similarity, subs.mat,
                            wt = -log10(freqs), f = phens.f)
   testthat::expect_equal(colnames(sm), levels(phens.f))