Fixes handling of term freqs that are missing or zero

The current database and API return subsumer terms for which there is no count for certain corpora. The KB API currently, and erroneously, returns a zero count for those terms when it should return no count. This change handles both cases (a zero count and a missing count) by removing those subsumers from the calculation. Fixes #153.
phenoscape · Feb 2, 2021 · 26d6eab · 26d6eab
1 parent d661479
commit 26d6eab
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 5 deletions.
diff --git a/R/semsim.R b/R/semsim.R
@@ -213,6 +213,8 @@ cosine_similarity <- function(subsumer_mat = NA, terms = NULL, ...) {
 #'   greater than any of its superclasses. If a function, it must accept parameter
 #'   `x` as the vector of term IRIs and return a vector of frequencies (_not_
 #'   IC scores) for the terms. The default is to use function [term_freqs()].
+#'   Subsumer terms with zero or missing (NA) frequency will be omitted from
+#'   the calculation.
 #' @param wt_args list, named parameters for the function calculating term
 #'   frequencies. Ignored if `wt` is not a function. For the default `wt`
 #'   function [term_freqs()], the main parameters are `as` and `corpus`. 
@@ -248,7 +250,14 @@ resnik_similarity <- function(subsumer_mat = NA, terms = NULL, ...,
   if (missing(wt) || is.function(wt)) {
     wt_args$x <- rownames(subsumer_mat)
     wt <- do.call(wt, wt_args)
-    wt[wt == 0] <- 1
+    # Terms with frequency zero should not occur in the subsumer matrix, so
+    # if there are any, they either shouldn't have been a subsumer, or they
+    # didn't yield a count. Either way, remove them from the computation.
+    rowsToRemove <- is.na(wt) | wt == 0
+    if (any(rowsToRemove)) {
+      wt <- wt[! rowsToRemove]
+      subsumer_mat <- subsumer_mat[! rowsToRemove,]
+    }
     # we assume we got frequencies, turn into IC
     wt <- -log(wt, base = base)
   }

diff --git a/man/similarity.Rd b/man/similarity.Rd
diff --git a/tests/testthat/test-semsim.R b/tests/testthat/test-semsim.R
@@ -84,9 +84,9 @@ test_that("Resnik similarity", {
   termICs <- -log10(term_freqs(phens$id, as = "phenotype", corpus = "taxa"))
   testthat::expect_equivalent(diag(sm.ic), termICs)
 
-  subs.ics <- -log10(term_freqs(rownames(subs.mat),
-                                as = "phenotype", corpus = "taxa"))
-  sm.ic2 <- resnik_similarity(subs.mat, wt = subs.ics)
+  tfreqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  sm.ic2 <- resnik_similarity(subs.mat[! (is.na(tfreqs) | tfreqs == 0), ],
+                              wt = -log10(tfreqs[! (is.na(tfreqs) | tfreqs == 0)]))
   testthat::expect_equal(sm.ic, sm.ic2)
 })
 
@@ -151,6 +151,9 @@ test_that("profile similarity with Resnik", {
   }))
 
   freqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  toKeep <- ! (is.na(freqs) | freqs == 0)
+  freqs <- freqs[toKeep]
+  subs.mat <- subs.mat[toKeep,]
   sm <- profile_similarity(resnik_similarity, subs.mat, wt = -log10(freqs),
                            f = phens.f)
   testthat::expect_equal(colnames(sm), levels(phens.f))