From bd12ec45e54b31a10d86a0883da741ea8e121fea Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 4 Apr 2023 14:15:25 -0700 Subject: [PATCH 01/11] add visualize component endpoint and clean up --- R/schematic_rest_api.R | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/R/schematic_rest_api.R b/R/schematic_rest_api.R index 6fd4a57..656a052 100644 --- a/R/schematic_rest_api.R +++ b/R/schematic_rest_api.R @@ -315,6 +315,61 @@ storage_project_manifests <- function(asset_view, } +#' Get all the attributes associated with a specific data model component formatted as a dataframe +#' +#' @param schema_url A data model URL +#' @param component Component of the data model to explore +#' @param base_url URL to schematic API endpoint +#' @export + +visualize_component <- function(schema_url, + component = "DataFlow", + base_url = "https://schematic-dev.api.sagebionetworks.org") { + + # create api url + url <- paste0(base_url, "/v1/visualize/component") + + # set up parameters for httr::get + params = list( + `schema_url` = schema_url, + `component` = component, + `include_index` = "false" + ) + + # GET + res <- httr::GET(url = url, query = params) + + # check that application returns json + # even when json = TRUE, http_type = "text/csv" + # if (httr::http_type(res) != "application/json") { + # stop("API did not return json", call. = FALSE) + # } + + # pull out content from request + parsed <- suppressMessages(httr::content(res)) + + # if the api call returns an error + # surface error to user + if (httr::http_error(res)) { + stop( + sprintf( + "Schematic API request failed [%s]", + httr::status_code(res) + ), + call. = FALSE + ) + } + + # return a helpful object + structure( + list( + content = parsed, + response = res + ), + class = "schematic_api" + ) +} + # print method for schematic_api class of functions print.schematic_api <- function(x, ...) 
{ str(x$content) From ff5244fdf1863ff4d596f3dff8886a93aa39a990 Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 4 Apr 2023 14:15:42 -0700 Subject: [PATCH 02/11] document --- NAMESPACE | 1 + man/model_submit.Rd | 2 +- man/visualize_component.Rd | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 man/visualize_component.Rd diff --git a/NAMESPACE b/NAMESPACE index 72b8502..5d9c9fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,7 @@ export(style_dashboard) export(true_false_icon) export(update_data_flow_manifest) export(update_dfs_manifest) +export(visualize_component) import(shiny) importFrom(golem,activate_js) importFrom(golem,add_resource_path) diff --git a/man/model_submit.Rd b/man/model_submit.Rd index 03b80c3..959c1ad 100644 --- a/man/model_submit.Rd +++ b/man/model_submit.Rd @@ -11,7 +11,7 @@ model_submit( file_name, input_token, restrict_rules = TRUE, - manifest_record_type = "table", + manifest_record_type = "table_and_file", base_url = "https://schematic-dev.api.sagebionetworks.org", schema_url = "https://raw.githubusercontent.com/Sage-Bionetworks/data_flow/main/inst/data_flow_component.jsonld", diff --git a/man/visualize_component.Rd b/man/visualize_component.Rd new file mode 100644 index 0000000..3764765 --- /dev/null +++ b/man/visualize_component.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/schematic_rest_api.R +\name{visualize_component} +\alias{visualize_component} +\title{Get all the attributes associated with a specific data model component formatted as a dataframe} +\usage{ +visualize_component( + schema_url, + component = "DataFlow", + base_url = "https://schematic-dev.api.sagebionetworks.org" +) +} +\arguments{ +\item{schema_url}{A data model URL} + +\item{component}{Component of the data model to explore} + +\item{base_url}{URL to schematic API endpoint} +} +\description{ +Get all the attributes associated with a specific data model component formatted as a dataframe +} From 840129919f4d28a68e25bad6de71f4ed989e6c9b Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 4 Apr 2023 14:16:01 -0700 Subject: [PATCH 03/11] add visualize_component test --- tests/testthat/test-schematic-api.R | 38 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/testthat/test-schematic-api.R b/tests/testthat/test-schematic-api.R index 539b41a..b295c24 100644 --- a/tests/testthat/test-schematic-api.R +++ b/tests/testthat/test-schematic-api.R @@ -3,9 +3,9 @@ # VARIABLES ############################################################################# # FAIR DEMO DATA PROJECT A -asset_view <- "syn50896957" -project_id <- "syn50896931" -dataset_id <- "syn51219090" +asset_view <- "syn50896957" # FAIR Demo All Projects, Files and Folders +project_id <- "syn50896931" # FAIR Demo Project A +dataset_id <- "syn51219090" # DataFlowStatusDFATesting input_token <- Sys.getenv("SYNAPSE_PAT") base_url <- Sys.getenv("SCHEMATIC_BASE_URL_AWS") testing_manifest_path <- "../../inst/testing/synapse_storage_manifest_dataflow.csv" @@ -14,36 +14,36 @@ schema_url <- "https://raw.githubusercontent.com/Sage-Bionetworks/data_flow/main # TEST API ############################################################################## test_that("storage_projects successfully returns a schematic_api object", { - sp <- try(storage_projects(asset_view = asset_view, + res <- try(storage_projects(asset_view = asset_view, input_token = input_token), silent = FALSE) - expect_true(class(sp) == 
"schematic_api") + expect_true(class(res) == "schematic_api") }) test_that("storage_project_datasets successfully returns a schematic_api object", { - spd <- try(storage_project_datasets(asset_view = asset_view, + res <- try(storage_project_datasets(asset_view = asset_view, project_id = project_id, input_token = input_token, base_url = base_url), silent = FALSE) - expect_true(class(spd) == "schematic_api") + expect_true(class(res) == "schematic_api") }) test_that("manifest_download successfully returns a schematic_api object", { - md <- try(manifest_download(input_token = input_token, + res <- try(manifest_download(input_token = input_token, asset_view = asset_view, dataset_id = dataset_id, base_url = base_url), silent = FALSE) - expect_true(class(md) == "schematic_api") + expect_true(class(res) == "schematic_api") }) test_that("model_submit successfully returns a schematic_api object", { - s <- try(model_submit(data_type = NULL, + res <- try(model_submit(data_type = NULL, asset_view = asset_view, dataset_id = dataset_id, file_name = testing_manifest_path, @@ -55,16 +55,26 @@ test_that("model_submit successfully returns a schematic_api object", { use_schema_label = TRUE), silent = FALSE) - expect_true(class(s) == "schematic_api") + expect_true(class(res) == "schematic_api") }) test_that("storage_project_manifests successfully returns a schematic_api object", { - spm <- try(storage_project_manifests(asset_view, + res <- try(storage_project_manifests(asset_view, project_id, input_token, base_url), silent = FALSE) - expect_true(class(spm) == "schematic_api") + expect_true(class(res) == "schematic_api") -}) \ No newline at end of file +}) + +test_that("visualize/component successfully returns a schematic_api object", { + res <- try(visualize_component(schema_url = schema_url, + component = "DataFlow", + base_url = base_url), + silent = FALSE) + + expect_true(class(res) == "schematic_api") + +}) From 2915f274357f6a06dd2aa788b86b54b59a01b89b Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 11 Apr 2023 17:23:46 -0700 Subject: [PATCH 04/11] add error message --- R/api_wrappers.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/api_wrappers.R b/R/api_wrappers.R index 431d88d..a6de119 100644 --- a/R/api_wrappers.R +++ b/R/api_wrappers.R @@ -93,6 +93,7 @@ calculate_items_per_manifest <- function(df, base_url = base_url) }, error=function(e) { + message(e) return(NULL) } ) From 5cf4938d1d3f2301c02975dd52403079629a4c67 Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 11 Apr 2023 17:24:57 -0700 Subject: [PATCH 05/11] add more manifest checks for update script --- R/manifest.R | 145 +++++++++++++++++++++++++++ tests/testthat/test-manifest.R | 174 ++++++++++++++++++++++----------- 2 files changed, 260 insertions(+), 59 deletions(-) diff --git a/R/manifest.R b/R/manifest.R index 077420f..7a04de8 100644 --- a/R/manifest.R +++ b/R/manifest.R @@ -271,4 +271,149 @@ update_data_flow_manifest <- function(asset_view, } else { print("No updates to manifest required at this time") } +} + +#' Update manifest with new datasets found in Synapse +#' +#' @param dataflow_manifest A dataFlow manifest +#' @param get_all_manifests_out The output of get_all_manifests. Also can be a dataframe that includes Component, contributor, entityId, dataset_name, and dataset. +#' @param asset_view ID of view listing all project data assets. For example, for Synapse this would be the Synapse ID of the fileview listing all data assets for a given project.(i.e. 
master_fileview in config.yml) +#' @param input_token Synapse PAT +#' @param base_url Base URL of schematic API (Defaults to AWS version) + +update_manifest_add_datasets <- function(dataflow_manifest, + get_all_manifests_out, + asset_view, + input_token, + base_url) { + + # check for new datasets by entityId + new_datasets <- get_all_manifests_out[!get_all_manifests_out$entityId %in% dataflow_manifest$entityId,] + + # if there are new datasets... + if (nrow(new_datasets) > 0) { + + print(paste0(nrow(new_datasets), " new dataset(s) found on Synapse")) + + # calculate number of items in each manifest + num_items <- tryCatch( + { + calculate_items_per_manifest(df = new_datasets, + asset_view = asset_view, + input_token = input_token, + base_url = base_url) + }, + error = function(e) { + message("get_all_manifests failed") + message(e) + } + ) + + # fill data flow manifest rows for missing datasets + # FIXME: Remove hardcoded column names + # This function will break if dataflow schema changes + # Source column names from schema? + new_datasets$release_scheduled <- rep("Not Applicable", nrow(new_datasets)) + new_datasets$embargo <- rep("Not Applicable", nrow(new_datasets)) + new_datasets$standard_compliance <- rep(FALSE, nrow(new_datasets)) + new_datasets$data_portal <- rep(FALSE, nrow(new_datasets)) + new_datasets$released <- rep(FALSE, nrow(new_datasets)) + new_datasets$num_items <- num_items + + # remove uuid col (prep for rbind) + if (any(grepl("Uuid", names(dataflow_manifest)))) { + uuid_idx <- grep("Uuid", names(dataflow_manifest)) + dataflow_manifest <- dataflow_manifest[, -uuid_idx] + } + + # bind together new dataset rows and data flow manifest + dataflow_manifest <- rbind(dataflow_manifest, new_datasets) + + + # rearrange data flow manifest + dataflow_manifest <- dataflow_manifest %>% + dplyr::group_by(contributor) %>% + dplyr::arrange(contributor) + } + + return(dataflow_manifest) + +} + +#' Remove datasets that are no longer found in Synapse +#' +#' @param dataflow_manifest A dataFlow manifest +#' @param get_all_manifests_out The output of get_all_manifests. Also can be a dataframe that includes Component, contributor, entityId, dataset_name, and dataset. +#' @param asset_view ID of view listing all project data assets. For example, for Synapse this would be the Synapse ID of the fileview listing all data assets for a given project.(i.e. master_fileview in config.yml) +#' @param input_token Synapse PAT +#' @param base_url Base URL of schematic API (Defaults to AWS version) + +update_manifest_remove_datasets <- function(dataflow_manifest, + get_all_manifests_out, + asset_view, + input_token, + base_url) { + + # check for removed datasets + remove_idx <- dataflow_manifest$entityId %in% get_all_manifests_out$entityId + + # if any of the rows are flagged for removal print a message and remove from manifest + if (any(!remove_idx)) { + n_remove <- sum(!remove_idx) + print(paste0(n_remove, " dataset(s) removed from Synapse")) + + dataflow_manifest <- dataflow_manifest[remove_idx,] + } + + return(dataflow_manifest) +} + +#' Update dataFlow manifest when dataset folder name changes +#' +#' @param dataflow_manifest A dataFlow manifest +#' @param get_all_manifests_out The output of get_all_manifests. Also can be a dataframe that includes Component, contributor, entityId, dataset_name, and dataset. +#' @param asset_view ID of view listing all project data assets. For example, for Synapse this would be the Synapse ID of the fileview listing all data assets for a given project.(i.e. 
master_fileview in config.yml)
+#' @param update_column Column name of the column to be updated
+#' @param recalc_num_items TRUE/FALSE. When an item is updated, should num_items be recalculated for that dataset?
+#' @param input_token Synapse PAT
+#' @param base_url Base URL of schematic API (Defaults to AWS version)
+#'
+#' @export
+
+update_manifest_column <- function(dataflow_manifest,
+                                   get_all_manifests_out,
+                                   update_column,
+                                   asset_view,
+                                   recalc_num_items = FALSE,
+                                   input_token,
+                                   base_url) {
+
+  # arrange by entityId
+  dataflow_manifest <- dplyr::arrange(dataflow_manifest, entityId)
+  get_all_manifests <- dplyr::arrange(get_all_manifests_out, entityId)
+
+  # get logical index of which items have changed
+  idx <- dataflow_manifest[[update_column]] != get_all_manifests[[update_column]]
+
+  # if any items have changed update dataset type column
+  if (any(idx)) {
+    n_changed <- sum(idx)
+    print(paste0("Making ", n_changed, " update(s) to ", update_column, " column"))
+    dataflow_manifest$dataset_name[idx] <- get_all_manifests_out$dataset_name[idx]
+
+    # if recalc_num_items = TRUE recalculate number of items in the manifest for updated items
+    if (recalc_num_items) {
+      dataflow_manifest$num_items[idx] <- calculate_items_per_manifest(df = dataflow_manifest[idx,],
+                                                                       asset_view = asset_view,
+                                                                       input_token = input_token,
+                                                                       base_url = base_url)
+    }
+  }
+
+  # rearrange data flow manifest
+  dataflow_manifest <- dataflow_manifest %>%
+    dplyr::group_by(contributor) %>%
+    dplyr::arrange(contributor)
+
+  return(dataflow_manifest)
+}
\ No newline at end of file
diff --git a/tests/testthat/test-manifest.R b/tests/testthat/test-manifest.R
index f85159d..e344bf1 100644
--- a/tests/testthat/test-manifest.R
+++ b/tests/testthat/test-manifest.R
@@ -1,73 +1,129 @@
+# CREATE TESTING VARIABLES
+
+base_url <- Sys.getenv("SCHEMATIC_BASE_URL_AWS")
+input_token <- Sys.getenv("SYNAPSE_PAT")
+asset_view <- "syn50896957"
+
+# mock synapse / submission ready manifest
+manifest_synapse <- data.frame(Component = "DataFlow",
+                               contributor = "FAIR demo data",
+                               entityId = "syn123",
+                               dataset_name = "biospecimen",
+                               dataset = "Biospecimen",
+                               num_items = "1",
+                               release_scheduled = "2050-01-01",
+                               embargo = "Not Applicable",
+                               standard_compliance = FALSE,
+                               data_portal = FALSE,
+                               released = FALSE)
+
+# mock dfa ready manifest
+manifest_dfa <- data.frame(Component = "DataFlow",
+                           contributor = as.factor("FAIR demo data"),
+                           entityId = "syn123",
+                           dataset_name = "biospecimen",
+                           dataset = as.factor("Biospecimen"),
+                           num_items = 1,
+                           release_scheduled = as.Date("2050-01-01"),
+                           embargo = as.Date(NA),
+                           standard_compliance = FALSE,
+                           data_portal = FALSE,
+                           released = FALSE)
+
+# mock get_all_manifests output
+get_all_manifests_same <- manifest_synapse[,c("Component", "contributor", "entityId", "dataset_name", "dataset")]
+
 # read in test config
 config <- jsonlite::read_json("../../inst/testing/datatable_dashboard_config.json")
 
-# make mock synapse / submission ready manifest
-manifest_synapse <- data.frame(Component = rep("DataFlow", 4),
-                               contributor = rep("schematic - main", 4),
-                               entityId = paste0("syn", 1:4),
-                               dataset_name = paste0("dataset_name", 1:4),
-                               dataset = rep("Biospecimen", 4),
-                               num_items = c(rep("Not Applicable", 3), "1"),
-                               release_scheduled = c(rep("Not Applicable", 2),
-                                                     rep("2050-01-01", 2)),
-                               embargo = c(rep("Not Applicable", 2),
-                                           rep("2050-01-01", 2)),
-                               standard_compliance = rep(FALSE, 4),
-                               data_portal = rep(FALSE, 4),
-                               released = rep(FALSE, 4))
-
-# make mock dfa ready manifest
-manifest_dfa <-
data.frame(Component = rep("DataFlow", 4), - contributor = rep(as.factor("schematic - main"), 4), - entityId = paste0("syn", 1:4), - dataset_name = paste0("dataset_name", 1:4), - dataset = rep(as.factor("Biospecimen"), 4), - num_items = c(rep(NA, 3), 1), - release_scheduled = c(rep(as.Date(NA), 2), - rep(as.Date("2050-01-01"), 2)), - embargo = c(rep(as.Date(NA), 2), - rep(as.Date("2050-01-01"), 2)), - standard_compliance = rep(FALSE, 4), - data_portal = rep(FALSE, 4), - released = rep(FALSE, 4)) +test_that("prep_manifest_dfa modifies manifest in expected way", { + expect_equal(prep_manifest_dfa(manifest_synapse, config), + manifest_dfa) +}) -# tests +test_that("prep_manifest_submit modifies manifest in expected way", { + expect_equal(prep_manifest_submit(manifest_dfa, config), + manifest_synapse) +}) -test_that("update_dfs_manifest", { - - dfs_updates <- list(release_scheduled = as.Date("2022-01-01"), - embargo = as.Date("2022-01-01"), - standard_compliance = TRUE, - data_portal = TRUE, - released = TRUE) +test_that("update_manifest_add_datasets adds new datasets to manifest", { - selected_datasets_df <- data.frame(id = c("syn1"), name = "dataset_name1") + # add a dataset + get_all_manifests_add_dataset <- rbind(get_all_manifests_same, + data.frame(Component = "DataFlow", + contributor = "FAIR demo data", + entityId = "syn44539618", + dataset_name = "MockComponent", + dataset = "Patient")) - expected_updated_row <- data.frame(Component = "DataFlow", - contributor = as.factor("schematic - main"), - entityId = "syn1", - dataset_name = paste0("dataset_name", 1), - dataset = as.factor("Biospecimen"), - num_items = as.numeric(NA), - release_scheduled = as.Date("2022-01-01"), - embargo = as.Date("2022-01-01"), - standard_compliance = TRUE, - data_portal = TRUE, - released = TRUE) - expected_df <- rbind(expected_updated_row, manifest_dfa[-1,]) + manifest <- update_manifest_add_datasets(dataflow_manifest = manifest_synapse, + get_all_manifests_out = get_all_manifests_add_dataset, + asset_view = asset_view, + input_token = input_token, + base_url = base_url) - expect_equal(update_dfs_manifest(dfs_manifest = manifest_dfa, - dfs_updates = dfs_updates, - selected_datasets_df = selected_datasets_df), - expected_df) + expect_equal(get_all_manifests_add_dataset$entityId, manifest$entityId) }) -test_that("prep_manifest_dfa", { - expect_equal(prep_manifest_dfa(manifest_synapse, config), - manifest_dfa) +test_that("update_manifest_remove_datasets removes datasets from manifest", { + + # remove a dataset + get_all_manifests_remove_dataset <- get_all_manifests_same[-2, ] + + manifest <- update_manifest_remove_datasets(dataflow_manifest = manifest_synapse, + get_all_manifests_out = get_all_manifests_remove_dataset, + asset_view = asset_view, + input_token = input_token, + base_url = base_url) + + expect_equal(get_all_manifests_remove_dataset$entityId, manifest$entityId) }) -test_that("prep_manifest_submit", { - expect_equal(prep_manifest_submit(manifest_dfa, config), - manifest_synapse) +test_that("update_manifest_column updates items in a selected column", { + + # change the dataset_name column + get_all_manifests_change_dataset_name <- get_all_manifests_same + get_all_manifests_change_dataset_name$dataset_name[1] <- "new_dataset_name" + + manifest <- update_manifest_column(dataflow_manifest = manifest_synapse, + get_all_manifests_out = get_all_manifests_change_dataset_name, + update_column = "dataset_name", + asset_view = asset_view, + recalc_num_items = FALSE, + input_token = input_token, + base_url = 
base_url)
+
+  expect_equal(manifest$dataset_name, "new_dataset_name")
+})
+
+# FIXME: FIX THIS TEST
+# test_that("update_dfs_manifest", {
+#
+#   dfs_updates <- list(release_scheduled = as.Date("2022-01-01"),
+#                       embargo = as.Date("2022-01-01"),
+#                       standard_compliance = TRUE,
+#                       data_portal = TRUE,
+#                       released = TRUE)
+#
+#   selected_datasets_df <- data.frame(id = c("syn123"), name = "biospecimen")
+#
+#   expected_updated_row <- data.frame(Component = "DataFlow",
+#                                      contributor = as.factor("FAIR demo data"),
+#                                      entityId = "syn123",
+#                                      dataset_name = "biospecimen",
+#                                      dataset = as.factor("Biospecimen"),
+#                                      num_items = as.numeric(NA),
+#                                      release_scheduled = as.Date("2022-01-01"),
+#                                      embargo = as.Date("2022-01-01"),
+#                                      standard_compliance = TRUE,
+#                                      data_portal = TRUE,
+#                                      released = TRUE)
+#   expected_df <- rbind(expected_updated_row, manifest_dfa[-1,])
+#
+#   expect_equal(update_dfs_manifest(dfs_manifest = manifest_dfa,
+#                                    dfs_updates = dfs_updates,
+#                                    selected_datasets_df = selected_datasets_df),
+#                expected_df)
+# })

From d8b364247fd3e54f9d9910d8c1a54d31be6fdc23 Mon Sep 17 00:00:00 2001
From: lakikowolfe
Date: Tue, 11 Apr 2023 17:26:44 -0700
Subject: [PATCH 06/11] don't export update manifest helper function

---
 R/manifest.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/R/manifest.R b/R/manifest.R
index 7a04de8..6c3d7b3 100644
--- a/R/manifest.R
+++ b/R/manifest.R
@@ -377,8 +377,6 @@ update_manifest_remove_datasets <- function(dataflow_manifest,
 #' @param recalc_num_items TRUE/FALSE. When an item is updated, should num_items be recalculated for that dataset?
 #' @param input_token Synapse PAT
 #' @param base_url Base URL of schematic API (Defaults to AWS version)
-#'
-#' @export
 
 update_manifest_column <- function(dataflow_manifest,

From 7be6fdb0e6d8cf1a27ae34f1c67e1adb04275004 Mon Sep 17 00:00:00 2001
From: lakikowolfe
Date: Tue, 11 Apr 2023 17:27:09 -0700
Subject: [PATCH 07/11] redocument

---
 man/model_submit.Rd                    |  2 +-
 man/update_manifest_add_datasets.Rd    | 28 +++++++++++++++++++++
 man/update_manifest_column.Rd          | 34 ++++++++++++++++++++++++++
 man/update_manifest_remove_datasets.Rd | 28 +++++++++++++++++++++
 4 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 man/update_manifest_add_datasets.Rd
 create mode 100644 man/update_manifest_column.Rd
 create mode 100644 man/update_manifest_remove_datasets.Rd

diff --git a/man/model_submit.Rd b/man/model_submit.Rd
index 03b80c3..959c1ad 100644
--- a/man/model_submit.Rd
+++ b/man/model_submit.Rd
@@ -11,7 +11,7 @@ model_submit(
   file_name,
   input_token,
   restrict_rules = TRUE,
-  manifest_record_type = "table",
+  manifest_record_type = "table_and_file",
   base_url = "https://schematic-dev.api.sagebionetworks.org",
   schema_url =
     "https://raw.githubusercontent.com/Sage-Bionetworks/data_flow/main/inst/data_flow_component.jsonld",

diff --git a/man/update_manifest_add_datasets.Rd b/man/update_manifest_add_datasets.Rd
new file mode 100644
index 0000000..78ec608
--- /dev/null
+++ b/man/update_manifest_add_datasets.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/manifest.R
+\name{update_manifest_add_datasets}
+\alias{update_manifest_add_datasets}
+\title{Update manifest with new datasets found in Synapse}
+\usage{
+update_manifest_add_datasets(
+  dataflow_manifest,
+  get_all_manifests_out,
+  asset_view,
+  input_token,
+  base_url
+)
+}
+\arguments{
+\item{dataflow_manifest}{A dataFlow manifest}
+
+\item{get_all_manifests_out}{The output of get_all_manifests.
Also can be a dataframe that includes Component, contributor, entityId, dataset_name, and dataset.}
+
+\item{asset_view}{ID of view listing all project data assets. For example, for Synapse this would be the Synapse ID of the fileview listing all data assets for a given project.(i.e. master_fileview in config.yml)}
+
+\item{input_token}{Synapse PAT}
+
+\item{base_url}{Base URL of schematic API (Defaults to AWS version)}
+}
+\description{
+Update manifest with new datasets found in Synapse
+}
diff --git a/man/update_manifest_column.Rd b/man/update_manifest_column.Rd
new file mode 100644
index 0000000..b98f20c
--- /dev/null
+++ b/man/update_manifest_column.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/manifest.R
+\name{update_manifest_column}
+\alias{update_manifest_column}
+\title{Update dataFlow manifest when dataset folder name changes}
+\usage{
+update_manifest_column(
+  dataflow_manifest,
+  get_all_manifests_out,
+  update_column,
+  asset_view,
+  recalc_num_items = FALSE,
+  input_token,
+  base_url
+)
+}
+\arguments{
+\item{dataflow_manifest}{A dataFlow manifest}
+
+\item{get_all_manifests_out}{The output of get_all_manifests. Also can be a dataframe that includes Component, contributor, entityId, dataset_name, and dataset.}
+
+\item{update_column}{Column name of the column to be updated}
+
+\item{asset_view}{ID of view listing all project data assets. For example, for Synapse this would be the Synapse ID of the fileview listing all data assets for a given project.(i.e. master_fileview in config.yml)}
+
+\item{recalc_num_items}{TRUE/FALSE. When an item is updated, should num_items be recalculated for that dataset?}
+
+\item{input_token}{Synapse PAT}
+
+\item{base_url}{Base URL of schematic API (Defaults to AWS version)}
+}
+\description{
+Update dataFlow manifest when dataset folder name changes
+}
diff --git a/man/update_manifest_remove_datasets.Rd b/man/update_manifest_remove_datasets.Rd
new file mode 100644
index 0000000..bafad7b
--- /dev/null
+++ b/man/update_manifest_remove_datasets.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/manifest.R
+\name{update_manifest_remove_datasets}
+\alias{update_manifest_remove_datasets}
+\title{Remove datasets that are no longer found in Synapse}
+\usage{
+update_manifest_remove_datasets(
+  dataflow_manifest,
+  get_all_manifests_out,
+  asset_view,
+  input_token,
+  base_url
+)
+}
+\arguments{
+\item{dataflow_manifest}{A dataFlow manifest}
+
+\item{get_all_manifests_out}{The output of get_all_manifests. Also can be a dataframe that includes Component, contributor, entityId, dataset_name, and dataset.}
+
+\item{asset_view}{ID of view listing all project data assets. For example, for Synapse this would be the Synapse ID of the fileview listing all data assets for a given project.(i.e.
master_fileview in config.yml)} + +\item{input_token}{Synapse PAT} + +\item{base_url}{Base URL of schematic API (Defaults to AWS version)} +} +\description{ +Remove datasets that are no longer found in Synapse +} From 38490ae3e03af72b9b4f4f4478d7ca055a1dfb9c Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Mon, 17 Apr 2023 15:28:53 -0700 Subject: [PATCH 08/11] fix bug in update_manifest_column, refactor update_data_flow_manifest --- R/manifest.R | 160 +++++++++++++++++++++++++-------------------------- 1 file changed, 79 insertions(+), 81 deletions(-) diff --git a/R/manifest.R b/R/manifest.R index 6c3d7b3..148c698 100644 --- a/R/manifest.R +++ b/R/manifest.R @@ -166,7 +166,7 @@ update_data_flow_manifest <- function(asset_view, print(paste0("Checking asset view ", asset_view, " for updates")) print(paste0("Getting data flow status manifest")) # get current data flow manifest - dfs_manifest <- tryCatch( + dataflow_manifest_obj <- tryCatch( { manifest_download(asset_view = asset_view, dataset_id = manifest_dataset_id, @@ -179,8 +179,10 @@ update_data_flow_manifest <- function(asset_view, } ) + dataflow_manifest <- dataflow_manifest_obj$content + # get all manifests for each storage project - print("Getting all manifests") + print(paste0("Getting all manifests under asset view ", asset_view, " from Synapse")) synapse_manifests <- tryCatch( { get_all_manifests(asset_view = asset_view, @@ -194,83 +196,79 @@ update_data_flow_manifest <- function(asset_view, } ) - print("Comparing data flow status manifest to current manifest list") + print("Checking data flow manifest for updates") + + # check synapse for new datasets + dataflow_manifest_updated <- update_manifest_add_datasets(dataflow_manifest = dataflow_manifest, + get_all_manifests_out = synapse_manifests, + asset_view = asset_view, + input_token = input_token, + base_url = base_url) + + # check synapse for removed datasets + dataflow_manifest_updated <- update_manifest_remove_datasets(dataflow_manifest = dataflow_manifest_updated, + get_all_manifests_out = synapse_manifests, + asset_view = asset_view, + input_token = input_token, + base_url = base_url) + + # check synapse for updates to dataset_name column + dataflow_manifest_updated <- update_manifest_column(dataflow_manifest = dataflow_manifest_updated, + get_all_manifests_out = synapse_manifests, + update_column = "dataset_name", + asset_view = asset_view, + recalc_num_items = FALSE, + input_token = input_token, + base_url = base_url) + + # check synapse for updates to dataset column + dataflow_manifest_updated <- update_manifest_column(dataflow_manifest = dataflow_manifest_updated, + get_all_manifests_out = synapse_manifests, + update_column = "dataset", + asset_view = asset_view, + recalc_num_items = TRUE, + input_token = input_token, + base_url = base_url) + + # compare updated dataflow manifest to initial manifest + # if uuid remove + if(any(grepl("Uuid", names(dataflow_manifest)))) { + idx <- grep("Uuid", names(dataflow_manifest)) + dataflow_manifest <- dataflow_manifest[,-idx] + } - # compare recent pull of all manifests to data flow manifest - missing_datasets_idx <- !synapse_manifests$entityId %in% dfs_manifest$content$entityId - missing_datasets <- synapse_manifests[missing_datasets_idx,] + changes_made <- !identical(dataflow_manifest, dataflow_manifest_updated) - # if there are missing datasets calculate number of items for each dataset and add in missing information - if (nrow(missing_datasets) > 0) { - - print(paste0(nrow(missing_datasets), " new dataset(s) found. 
Updating data flow status manifest")) - - # calculate number of items in each manifest - num_items <- tryCatch( - { - calculate_items_per_manifest(df = missing_datasets, - asset_view = asset_view, - input_token = input_token, - base_url = base_url) - }, - error = function(e) { - message("get_all_manifests failed") - message(e) + # if changes have been made submit to synapse + if (changes_made) { + # submit to synapse + # data_type = NULL until LP can fix model/submit endpoint for large manifests + # If no datatype indicated no validation will be done + message("submitting manifest to Synapse") + + # create manifest directory if it doesn't exist yet + if (!file.exists("./manifest/")) { + dir.create("./manifest/") } - ) - - # fill dfs manifest rows for missing datasets - # FIXME: Remove hardcoded column names - # This function will break if dataflow schema changes - # Source column names from schema? - missing_datasets$release_scheduled <- rep("Not Applicable", nrow(missing_datasets)) - missing_datasets$embargo <- rep("Not Applicable", nrow(missing_datasets)) - missing_datasets$standard_compliance <- rep(FALSE, nrow(missing_datasets)) - missing_datasets$data_portal <- rep(FALSE, nrow(missing_datasets)) - missing_datasets$released <- rep(FALSE, nrow(missing_datasets)) - missing_datasets$num_items <- num_items - - # remove uuid if present - if (any(names(dfs_manifest$content) == "Uuid")) { - uuid_idx <- grep("Uuid", names(dfs_manifest$content)) - dfs_manifest$content <- dfs_manifest$content[,-uuid_idx] - } - - # tack on missing datasets to end of dfs_status_manifest - updated_dfs_manifest <- rbind(dfs_manifest$content, missing_datasets) - - # sort dataframe so that contributor is grouped - updated_dfs_manifest <- updated_dfs_manifest %>% - dplyr::group_by(contributor) %>% - dplyr::arrange(contributor) - - # submit to synapse - # data_type = NULL until LP can fix model/submit endpoint for large manifests - # If no datatype indicated no validation will be done - message("submitting manifest to Synapse") - - # create manifest directory if it doesn't exist yet - if (!file.exists("./manifest/")) { - dir.create("./manifest/") - } - - # write to csv for submission - file_path <- "./manifest/synapse_storage_manifest_dataflow.csv" - write.csv(updated_dfs_manifest, file_path, row.names = FALSE) - - # submit to synapse - model_submit(data_type = NULL, - asset_view = asset_view, - dataset_id = manifest_dataset_id, - file_name = file_path, - restrict_rules = TRUE, - input_token = input_token, - manifest_record_type = "table_and_file", - base_url = base_url, - schema_url = "https://raw.githubusercontent.com/Sage-Bionetworks/data_flow/main/inst/data_flow_component.jsonld") + + # write to csv for submission + file_path <- "./manifest/synapse_storage_manifest_dataflow.csv" + write.csv(dataflow_manifest_updated, file_path, row.names = FALSE) + + # submit to synapse + model_submit(data_type = NULL, + asset_view = asset_view, + dataset_id = manifest_dataset_id, + file_name = file_path, + restrict_rules = TRUE, + input_token = input_token, + manifest_record_type = "table_and_file", + base_url = base_url, + schema_url = "https://raw.githubusercontent.com/Sage-Bionetworks/data_flow/main/inst/data_flow_component.jsonld") } else { print("No updates to manifest required at this time") } + } #' Update manifest with new datasets found in Synapse @@ -304,7 +302,7 @@ update_manifest_add_datasets <- function(dataflow_manifest, base_url = base_url) }, error = function(e) { - message("get_all_manifests failed") + 
message("num_items calculation failed") message(e) } ) @@ -388,23 +386,23 @@ update_manifest_column <- function(dataflow_manifest, # arrange by entityId dataflow_manifest <- dplyr::arrange(dataflow_manifest, entityId) - get_all_manifests <- dplyr::arrange(get_all_manifests_out, entityId) + get_all_manifests_out <- dplyr::arrange(get_all_manifests_out, entityId) # get logical index of which items have changed - idx <- dataflow_manifest[[update_column]] != get_all_manifests[[update_column]] + idx <- dataflow_manifest[,update_column] != get_all_manifests_out[, update_column] # if any items have changed update dataset type column if (any(idx)) { n_changed <- sum(idx) print(paste0("Making ", n_changed, " update(s) to ", update_column, " column")) - dataflow_manifest$dataset_name[idx] <- get_all_manifests_out$dataset_name[idx] + dataflow_manifest[idx, update_column] <- get_all_manifests_out[idx, update_column] # if recalc_num_items = TRUE recalculate number of items in the manifest for updated items if (recalc_num_items) { dataflow_manifest$num_items[idx] <- calculate_items_per_manifest(df = dataflow_manifest[idx,], - asset_view = asset_view, - input_token = input_token, - base_url = base_url) + asset_view = asset_view, + input_token = input_token, + base_url = base_url) } } @@ -414,4 +412,4 @@ update_manifest_column <- function(dataflow_manifest, dplyr::arrange(contributor) return(dataflow_manifest) -} \ No newline at end of file +} From 16c34fc6134d06e13ef58ed4c37c3be63a0dc370 Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Mon, 17 Apr 2023 15:29:20 -0700 Subject: [PATCH 09/11] add handling for Not Applicable and NA --- R/api_wrappers.R | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/R/api_wrappers.R b/R/api_wrappers.R index a6de119..9496e90 100644 --- a/R/api_wrappers.R +++ b/R/api_wrappers.R @@ -46,11 +46,17 @@ get_all_manifests <- function(asset_view, if (nrow(manifests$content) > 0) { # pull together in a dataframe - return(data.frame(Component = rep("DataFlow", nrow(manifests$content)), - contributor = rep(sp_name, nrow(manifests$content)), - entityId = manifests$content$dataset_id, - dataset_name = manifests$content$folder_name, - dataset = manifests$content$data_type)) + df <- data.frame(Component = rep("DataFlow", nrow(manifests$content)), + contributor = rep(sp_name, nrow(manifests$content)), + entityId = manifests$content$dataset_id, + dataset_name = manifests$content$folder_name, + dataset = manifests$content$data_type) + + # update empty cells to "Not Applicable" + df[ df == "" ] <- "Not Applicable" + + return(df) + } else { return(NULL) } @@ -74,11 +80,11 @@ calculate_items_per_manifest <- function(df, asset_view, input_token, base_url) { - + sapply(1:nrow(df), function(i) { # dataset == "" indicates that there is no manifest - if (df$dataset[i] == "") { + if (df$dataset[i] == "Not Applicable"| df$dataset[i] == "" | is.na(df$dataset[i])) { manifest_nrow <- "Not Applicable" @@ -101,7 +107,7 @@ calculate_items_per_manifest <- function(df, # if no manifest is downloaded, return NA # otherwise count rows and return nrow manifest_nrow <- ifelse(is.null(manifest$content), "Not Applicable", nrow(manifest$content)) - } + } return(manifest_nrow) }) From 1e6b33d9306e01b78eec959e7df9abdd5e5c4e21 Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 18 Apr 2023 14:27:29 -0700 Subject: [PATCH 10/11] tweak functions so that manifest only submits when updated --- R/manifest.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff 
--git a/R/manifest.R b/R/manifest.R index 148c698..1a17440 100644 --- a/R/manifest.R +++ b/R/manifest.R @@ -120,7 +120,7 @@ generate_data_flow_manifest_skeleton <- function(asset_view, # count rows in each manifest listed if (calc_num_items) { - num_items <- calculate_items_per_manifest(get_all_manifests_out = dfs_manifest, + num_items <- calculate_items_per_manifest(df = dfs_manifest, asset_view = asset_view, input_token = input_token, base_url = base_url) @@ -163,8 +163,15 @@ update_data_flow_manifest <- function(asset_view, input_token, base_url) { + # if uuid remove + if(any(grepl("Uuid", names(dataflow_manifest)))) { + idx <- grep("Uuid", names(dataflow_manifest)) + dataflow_manifest <- dataflow_manifest[,-idx] + } + print(paste0("Checking asset view ", asset_view, " for updates")) print(paste0("Getting data flow status manifest")) + # get current data flow manifest dataflow_manifest_obj <- tryCatch( { @@ -231,11 +238,6 @@ update_data_flow_manifest <- function(asset_view, base_url = base_url) # compare updated dataflow manifest to initial manifest - # if uuid remove - if(any(grepl("Uuid", names(dataflow_manifest)))) { - idx <- grep("Uuid", names(dataflow_manifest)) - dataflow_manifest <- dataflow_manifest[,-idx] - } changes_made <- !identical(dataflow_manifest, dataflow_manifest_updated) @@ -327,14 +329,13 @@ update_manifest_add_datasets <- function(dataflow_manifest, # bind together new dataset rows and data flow manifest dataflow_manifest <- rbind(dataflow_manifest, new_datasets) - # rearrange data flow manifest dataflow_manifest <- dataflow_manifest %>% dplyr::group_by(contributor) %>% dplyr::arrange(contributor) } - return(dataflow_manifest) + return(data.frame(dataflow_manifest)) } @@ -411,5 +412,5 @@ update_manifest_column <- function(dataflow_manifest, dplyr::group_by(contributor) %>% dplyr::arrange(contributor) - return(dataflow_manifest) + return(data.frame(dataflow_manifest)) } From 061a35396fd3eae77e055fa39d5bda25682b5d8f Mon Sep 17 00:00:00 2001 From: lakikowolfe Date: Tue, 18 Apr 2023 14:41:07 -0700 Subject: [PATCH 11/11] move uuid removal further down script --- R/manifest.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/manifest.R b/R/manifest.R index 1a17440..e1ffd79 100644 --- a/R/manifest.R +++ b/R/manifest.R @@ -163,12 +163,6 @@ update_data_flow_manifest <- function(asset_view, input_token, base_url) { - # if uuid remove - if(any(grepl("Uuid", names(dataflow_manifest)))) { - idx <- grep("Uuid", names(dataflow_manifest)) - dataflow_manifest <- dataflow_manifest[,-idx] - } - print(paste0("Checking asset view ", asset_view, " for updates")) print(paste0("Getting data flow status manifest")) @@ -188,6 +182,12 @@ update_data_flow_manifest <- function(asset_view, dataflow_manifest <- dataflow_manifest_obj$content + # if uuid remove + if(any(grepl("Uuid", names(dataflow_manifest)))) { + idx <- grep("Uuid", names(dataflow_manifest)) + dataflow_manifest <- dataflow_manifest[,-idx] + } + # get all manifests for each storage project print(paste0("Getting all manifests under asset view ", asset_view, " from Synapse")) synapse_manifests <- tryCatch(
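
Taken together, PATCH 05 through PATCH 11 reduce update_data_flow_manifest() to a reconcile-then-submit pipeline built from the three update_manifest_* helpers, and PATCH 01 adds a standalone wrapper for the visualize/component endpoint. Below is a minimal usage sketch of those pieces, not part of the patches themselves. It assumes the package is loaded via devtools::load_all() (the helpers are no longer exported after PATCH 06), that SYNAPSE_PAT and SCHEMATIC_BASE_URL_AWS are set as in the test suite, and it reuses the FAIR demo Synapse IDs from the tests; argument lists are limited to what the diffs show, so treat it as a sketch rather than a definitive workflow.

# --- Usage sketch (not part of the patch series) --------------------------
# Assumes devtools::load_all(), SYNAPSE_PAT / SCHEMATIC_BASE_URL_AWS set,
# and the FAIR demo IDs from tests/testthat.
input_token <- Sys.getenv("SYNAPSE_PAT")
base_url    <- Sys.getenv("SCHEMATIC_BASE_URL_AWS")
asset_view  <- "syn50896957"   # FAIR Demo All Projects, Files and Folders
schema_url  <- "https://raw.githubusercontent.com/Sage-Bionetworks/data_flow/main/inst/data_flow_component.jsonld"

# New endpoint wrapper (PATCH 01): one component's attributes as a dataframe.
# The endpoint returns text/csv, so res$content is parsed tabular data.
res <- visualize_component(schema_url = schema_url,
                           component = "DataFlow",
                           base_url = base_url)
str(res$content)

# Reconcile pipeline (PATCH 05/08): the same steps update_data_flow_manifest()
# now runs internally before deciding whether to resubmit.
dataflow_manifest <- manifest_download(asset_view = asset_view,
                                       dataset_id = "syn51219090",  # DataFlowStatusDFATesting
                                       input_token = input_token,
                                       base_url = base_url)$content
synapse_manifests <- get_all_manifests(asset_view = asset_view,
                                       input_token = input_token,
                                       base_url = base_url)

updated <- update_manifest_add_datasets(dataflow_manifest, synapse_manifests,
                                        asset_view, input_token, base_url)
updated <- update_manifest_remove_datasets(updated, synapse_manifests,
                                           asset_view, input_token, base_url)
updated <- update_manifest_column(updated, synapse_manifests,
                                  update_column = "dataset_name",
                                  asset_view = asset_view,
                                  recalc_num_items = FALSE,
                                  input_token = input_token,
                                  base_url = base_url)
updated <- update_manifest_column(updated, synapse_manifests,
                                  update_column = "dataset",
                                  asset_view = asset_view,
                                  recalc_num_items = TRUE,
                                  input_token = input_token,
                                  base_url = base_url)

# PATCH 10/11 gate the resubmit on this comparison (after dropping any Uuid column).
identical(dataflow_manifest, updated)   # FALSE means changes were found and model_submit() would run

In practice callers would just invoke update_data_flow_manifest() and let it decide whether a resubmit is needed; the step-by-step form above mirrors its body after PATCH 08.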