From 26d6eabd8aefed0354b8560f76c14bb00b85b524 Mon Sep 17 00:00:00 2001
From: Hilmar Lapp <hlapp@drycafe.net>
Date: Mon, 1 Feb 2021 16:46:10 -0500
Subject: [PATCH] Fixes handling of term freqs that are missing or zero

The current database and API return subsumer terms for which there is
no count for certain corpora. Erroneously, the KB API currently returns
a zero count for those, when it should return no count. This change
handles both cases by removing those subsumers from the calculation.

Fixes #153.
---
 R/semsim.R                   | 11 ++++++++++-
 man/similarity.Rd            |  4 +++-
 tests/testthat/test-semsim.R |  9 ++++++---
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/R/semsim.R b/R/semsim.R
index e4e3a5f..270e536 100644
--- a/R/semsim.R
+++ b/R/semsim.R
@@ -213,6 +213,8 @@ cosine_similarity <- function(subsumer_mat = NA, terms = NULL, ...) {
 #'   greater than any of its superclasses. If a function, it must accept parameter
 #'   `x` as the vector of term IRIs and return a vector of frequencies (_not_
 #'   IC scores) for the terms. The default is to use function [term_freqs()].
+#'   Subsumer terms with zero or missing (NA) frequency will be omitted from
+#'   the calculation.
 #' @param wt_args list, named parameters for the function calculating term
 #'   frequencies. Ignored if `wt` is not a function. For the default `wt`
 #'   function [term_freqs()], the main parameters are `as` and `corpus`. 
@@ -248,7 +250,14 @@ resnik_similarity <- function(subsumer_mat = NA, terms = NULL, ...,
   if (missing(wt) || is.function(wt)) {
     wt_args$x <- rownames(subsumer_mat)
     wt <- do.call(wt, wt_args)
-    wt[wt == 0] <- 1
+    # Terms with zero or missing frequency either shouldn't have been a
+    # subsumer or didn't yield a count, so remove them from the computation.
+    # drop = FALSE keeps the 2-d shape even if a single row/column remains.
+    rowsToRemove <- is.na(wt) | wt == 0
+    if (any(rowsToRemove)) {
+      wt <- wt[! rowsToRemove]
+      subsumer_mat <- subsumer_mat[! rowsToRemove, , drop = FALSE]
+    }
     # we assume we got frequencies, turn into IC
     wt <- -log(wt, base = base)
   }
diff --git a/man/similarity.Rd b/man/similarity.Rd
index 930ebdc..4d3b9d4 100644
--- a/man/similarity.Rd
+++ b/man/similarity.Rd
@@ -36,7 +36,9 @@ scores, though any score will work for which a higher value means higher
 information content, and where a term will always have a score equal to or
 greater than any of its superclasses. If a function, it must accept parameter
 \code{x} as the vector of term IRIs and return a vector of frequencies (\emph{not}
-IC scores) for the terms. The default is to use function \code{\link[=term_freqs]{term_freqs()}}.}
+IC scores) for the terms. The default is to use function \code{\link[=term_freqs]{term_freqs()}}.
+Subsumer terms with zero or missing (NA) frequency will be omitted from the
+calculation.}
 
 \item{wt_args}{list, named parameters for the function calculating term
 frequencies. Ignored if \code{wt} is not a function. For the default \code{wt}
diff --git a/tests/testthat/test-semsim.R b/tests/testthat/test-semsim.R
index ec95918..d4472d2 100644
--- a/tests/testthat/test-semsim.R
+++ b/tests/testthat/test-semsim.R
@@ -84,9 +84,9 @@ test_that("Resnik similarity", {
   termICs <- -log10(term_freqs(phens$id, as = "phenotype", corpus = "taxa"))
   testthat::expect_equivalent(diag(sm.ic), termICs)
 
-  subs.ics <- -log10(term_freqs(rownames(subs.mat),
-                                as = "phenotype", corpus = "taxa"))
-  sm.ic2 <- resnik_similarity(subs.mat, wt = subs.ics)
+  tfreqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  sm.ic2 <- resnik_similarity(subs.mat[! (is.na(tfreqs) | tfreqs == 0), ],
+                              wt = -log10(tfreqs[! (is.na(tfreqs) | tfreqs == 0)]))
   testthat::expect_equal(sm.ic, sm.ic2)
 })
 
@@ -151,6 +151,9 @@ test_that("profile similarity with Resnik", {
   }))
 
   freqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  toKeep <- ! (is.na(freqs) | freqs == 0)
+  freqs <- freqs[toKeep]
+  subs.mat <- subs.mat[toKeep,]
   sm <- profile_similarity(resnik_similarity, subs.mat, wt = -log10(freqs),
                            f = phens.f)
   testthat::expect_equal(colnames(sm), levels(phens.f))