diff --git a/NEWS.md b/NEWS.md
index cb3ba38197..f230bd663b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,8 @@
 # mlr 2.18.0.9000

-- Internal changes only.
+- Warning if `fix.factors.prediction = TRUE` produces NAs because of new factor levels in the prediction data.
+- Clear error message if the prediction of a wrapped learner does not have the same length as `newdata`.
+- Internal changes.

 # mlr 2.18.0
diff --git a/R/predict.R b/R/predict.R
index 3a0de7270c..5da64383da 100644
--- a/R/predict.R
+++ b/R/predict.R
@@ -144,6 +144,12 @@ predict.WrappedModel = function(object, task, newdata, subset = NULL, ...) {
         dump = addClasses(get("last.dump", envir = .GlobalEnv), "mlr.dump")
       }
     }
+    # did the prediction fail otherwise?
+    np = nrow(p)
+    if (is.null(np)) np = length(p)
+    if (np != nrow(newdata)) {
+      stopf("predictLearner for %s has returned %i predictions instead of %i!", learner$id, np, nrow(newdata))
+    }
   }
   if (missing(task)) {
     ids = NULL
diff --git a/R/predictLearner.R b/R/predictLearner.R
index 1f326dcd55..3e8d4f1825 100644
--- a/R/predictLearner.R
+++ b/R/predictLearner.R
@@ -55,8 +55,13 @@ predictLearner2 = function(.learner, .model, .newdata, ...) {
     ns = intersect(colnames(.newdata), ns)
     fls = fls[ns]
     if (length(ns) > 0L) {
-      .newdata[ns] = mapply(factor, x = .newdata[ns],
-        levels = fls, SIMPLIFY = FALSE)
+      safe_factor = function(x, levels) {
+        if (length(setdiff(levels(x), levels)) > 0) {
+          warning("fix.factors.prediction = TRUE produced NAs because of new factor levels in prediction data.")
+        }
+        factor(x, levels)
+      }
+      .newdata[ns] = mapply(safe_factor, x = .newdata[ns], levels = fls, SIMPLIFY = FALSE)
     }
   }
   p = predictLearner(.learner, .model, .newdata, ...)
diff --git a/man/getFeatureImportance.Rd b/man/getFeatureImportance.Rd
index 0d6f63232f..89c0b47145 100644
--- a/man/getFeatureImportance.Rd
+++ b/man/getFeatureImportance.Rd
@@ -54,7 +54,7 @@ This is identical to randomForest.
 \item randomForestSRC \cr
 This method can calculate feature importance for various measures.
 By default the Breiman-Cutler permutation method is used. See
-\code{\link[randomForestSRC:vimp.rfsrc]{randomForestSRC::vimp()}} for details.
+\code{\link[randomForestSRC:vimp]{randomForestSRC::vimp()}} for details.
 \item ranger \cr
 Supports both measures mentioned above for the randomForest learner.
 Note, that you need to specifically set the learners parameter
diff --git a/tests/testthat/test_base_resample.R b/tests/testthat/test_base_resample.R
index bd63c22a75..628490b370 100644
--- a/tests/testthat/test_base_resample.R
+++ b/tests/testthat/test_base_resample.R
@@ -186,7 +186,8 @@ test_that("resample printer respects show.info", {
 })

 test_that("resample drops unseen factors in predict data set", {
-  data = data.frame(a = c("a", "b", "a", "b", "a", "c"),
+  data = data.frame(
+    a = c("a", "b", "a", "b", "a", "c"),
     b = c(1, 1, 2, 2, 2, 1),
     trg = c("a", "b", "a", "b", "a", "b"),
     stringsAsFactors = TRUE)
@@ -202,6 +203,12 @@ test_that("resample drops unseen factors in predict data set", {

   lrn = makeLearner("classif.logreg", fix.factors.prediction = TRUE)
   model = train(lrn, subsetTask(task, 1:4))
-  predict(model, subsetTask(task, 5:6))
-  resample(lrn, task, resinst)
+  expect_warning(predict(model, subsetTask(task, 5:6)), "produced NAs because of new factor levels")
+  expect_warning(resample(lrn, task, resinst), "produced NAs because of new factor levels")
+
+  # do it manually, on a task built from the first four rows only;
+  # fixup = "quiet" because the dropped-factor-levels warning is expected here
+  train_task = makeClassifTask("unseen.factors", data[1:4, ], "trg", fixup = "quiet")
+  model = train(lrn, train_task)
+  expect_warning(predict(model, newdata = data[5:6, ]), "produced NAs because of new factor levels")
 })
diff --git a/tests/testthat/test_classif_ksvm.R b/tests/testthat/test_classif_ksvm.R
index 9a3b8953c1..dbaa2559c7 100644
--- a/tests/testthat/test_classif_ksvm.R
+++ b/tests/testthat/test_classif_ksvm.R
@@ -48,3 +48,21 @@ test_that("classif_ksvm", {
   testCV("classif.ksvm", multiclass.df, multiclass.target, tune.train = tt, tune.predict = tp,
     parset = list(kernel = "polydot", degree = 3, offset = 2, scale = 1.5))
 })
+
+test_that("classif_ksvm produces error for new factor levels on predict", {
+  # https://github.com/mlr-org/mlr/issues/2771
+  train_data = data.frame(
+    A = sample(c("A", "B"), 10, TRUE),
+    B = factor(sample(c("A", "B"), 10, replace = TRUE))
+  )
+  test_data = data.frame(
+    A = sample(c("A", "B"), 10, TRUE),
+    B = factor(sample(c("A", "B", "C"), 10, replace = TRUE))
+  )
+  lrn = makeLearner("classif.ksvm", fix.factors.prediction = TRUE)
+  train_task = makeClassifTask(data = train_data, target = "A")
+  model = train(lrn, train_task)
+  expect_warning({
+    expect_error(predict(model, newdata = test_data), "has returned .+ instead of 10")
+  }, "produced NAs because of new factor levels")
+})
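
For illustration only (this block is not part of the patch): a minimal sketch of the new warning, reusing the toy data and the `classif.logreg` learner from the resample test above; it assumes an mlr build that includes these changes.

library(mlr)

# level "c" appears only in row 6, so it is unseen during training
data = data.frame(
  a = c("a", "b", "a", "b", "a", "c"),
  b = c(1, 1, 2, 2, 2, 1),
  trg = c("a", "b", "a", "b", "a", "b"),
  stringsAsFactors = TRUE)

lrn = makeLearner("classif.logreg", fix.factors.prediction = TRUE)
train_task = makeClassifTask("unseen.factors", data[1:4, ], "trg", fixup = "quiet")
model = train(lrn, train_task)

# row 6 carries the unseen level "c"; refactoring it to the training levels yields NA,
# so this now warns:
# "fix.factors.prediction = TRUE produced NAs because of new factor levels in prediction data."
predict(model, newdata = data[5:6, ])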