From 45ac04e89d97b79abbacf21bb6f03bec3f460a17 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 8 May 2019 12:25:49 +0100 Subject: [PATCH] initial ShogunOpenML class --- src/interfaces/swig/IO.i | 2 + src/shogun/base/SGObject.cpp | 1 - src/shogun/io/OpenMLFlow.cpp | 287 +++++++++++++++++++++++++++++------ src/shogun/io/OpenMLFlow.h | 135 +++++++++++----- 4 files changed, 342 insertions(+), 83 deletions(-) diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 4e59008adf4..b3094311ec3 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -26,6 +26,8 @@ %rename(MemoryMappedFile) CMemoryMappedFile; %shared_ptr(shogun::OpenMLFlow) +%shared_ptr(shogun::ShogunOpenML::flow_to_model) +%shared_ptr(shogun::ShogunOpenML::model_to_flow) %include %include diff --git a/src/shogun/base/SGObject.cpp b/src/shogun/base/SGObject.cpp index b91eff7f709..48ff54a3f64 100644 --- a/src/shogun/base/SGObject.cpp +++ b/src/shogun/base/SGObject.cpp @@ -1112,5 +1112,4 @@ std::string CSGObject::string_enum_reverse_lookup( return p.second == enum_value; }); return enum_map_it->first; - } diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index a7ef3279f97..b18d80bae4b 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -15,6 +15,14 @@ using namespace shogun; using namespace rapidjson; +/** + * The writer callback function used to write the packets to a C++ string. 
+ * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) { // adapted from https://stackoverflow.com/a/5780603 @@ -30,13 +38,16 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) return 0; } +/* OpenML server format */ const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +/* DATA API */ const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; const char* OpenMLReader::data_features = "/data/features/{}"; const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ const char* OpenMLReader::flow_file = "/flow/{}"; const std::unordered_map @@ -84,25 +95,16 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) if (code != CURLE_OK) { // TODO: call curl_easy_cleanup(curl_handle) ? - SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(code)) } - // else - // { - // long response_code; - // curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, - //&response_code); if (response_code == 200) return; - // else - // { - // if (response_code == 181) - // SG_SERROR("Unknown flow. 
The flow with the given ID was not - // found in the database.") else if (response_code == 180) - // SG_SERROR("") SG_SERROR("Server code: %d\n", response_code) - // } - // } } #endif // HAVE_CURL +/** + * Checks the returned flow in JSON format + * @param doc the parsed flow + */ static void check_flow_response(rapidjson::Document& doc) { if (SG_UNLIKELY(doc.HasMember("error"))) @@ -116,24 +118,36 @@ static void check_flow_response(rapidjson::Document& doc) REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); } +/** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericValue, i.e. string + * @param param_dict the map to write to + * @param name the name of the key + */ static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, + const GenericValue>& v, std::unordered_map& param_dict, const std::string& name) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + if (v[name.c_str()].GetType() == Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); else param_dict.emplace(name, ""); } +/** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericObject, i.e. array + * @param param_dict the map to write to + * @param name the name of the key + */ static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericObject< - true, rapidjson::GenericValue>>& v, + const GenericObject< + true, GenericValue>>& v, std::unordered_map& param_dict, const std::string& name) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + if (v[name.c_str()].GetType() == Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); else param_dict.emplace(name, ""); @@ -234,52 +248,235 @@ std::shared_ptr OpenMLFlow::from_file() return std::shared_ptr(); } +/** + * Class using the Any visitor pattern to convert + * a string to a C++ type that can be used as a parameter + * in a Shogun model. 
+ */ +class StringToShogun : public AnyVisitor +{ +public: + explicit StringToShogun(std::shared_ptr model) + : m_model(model), m_parameter(""), m_string_val(""){}; + + StringToShogun( + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + + void on(bool* v) final + { + if (!is_null()) + { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + bool result = strcmp(m_string_val.c_str(), "true") == 0; + m_model->put(m_parameter, result); + } + } + void on(int32_t* v) final + { + if (!is_null()) + { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + try + { + int32_t result = std::stoi(m_string_val); + m_model->put(m_parameter, result); + } + catch (const std::invalid_argument&) + { + // it's an option, i.e. internally represented + // as an enum but in swig exposed as a string + m_string_val.erase( + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); + m_model->put(m_parameter, m_string_val); + } + } + } + void on(int64_t* v) final + { + if (!is_null()) + { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + int64_t result = std::stol(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(float* v) final + { + if (!is_null()) + { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + float32_t result = std::strtof(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(double* v) final + { + if (!is_null()) + { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + float64_t result = std::strtod(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(long double* v) + { + if (!is_null()) + { + SG_SDEBUG("long double: %s=%s\n", m_parameter.c_str(), 
m_string_val.c_str()) + char* end; + floatmax_t result = std::strtold(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(CSGObject** v) final + { + SG_SDEBUG("CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + + bool is_null() + { + bool result = strcmp(m_string_val.c_str(), "null") == 0; + return result; + } + + void set_parameter_name(const std::string& name) + { + m_parameter = name; + } + + void set_string_value(const std::string& value) + { + m_string_val = value; + } + +private: + std::shared_ptr m_model; + std::string m_parameter; + std::string m_string_val; +}; + +/** + * Instantiates a CSGObject using a factory + * @param factory_name the name of the factory + * @param algo_name the name of algorithm passed to factory + * @return the instantiated object using a factory + */ +std::shared_ptr instantiate_model_from_factory( + const std::string& factory_name, const std::string& algo_name) +{ + std::shared_ptr obj; + if (factory_name == "machine") + obj = std::shared_ptr(machine(algo_name)); + else if (factory_name == "kernel") + obj = std::shared_ptr(kernel(algo_name)); + else if (factory_name == "distance") + obj = std::shared_ptr(distance(algo_name)); + else + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return 
obj; +} + +/** + * Downcasts a CSGObject and puts it in the map of obj. + * @param obj the main object + * @param nested_obj the object to be casted and put in the obj map. + * @param parameter_name the name of nested_obj + */ +void cast_and_put( + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) +{ + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + // TODO: remove clone + // temporary fix until shared_ptr PR merged + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else + SG_SERROR("Could not cast SGObject.\n") +} + std::shared_ptr ShogunOpenML::flow_to_model( std::shared_ptr flow, bool initialize_with_defaults) { - std::string name; - std::string val_as_string; - std::shared_ptr obj; auto params = flow->get_parameters(); auto components = flow->get_components(); auto class_name = get_class_info(flow->get_class_name()); auto module_name = std::get<0>(class_name); auto algo_name = std::get<1>(class_name); - if (module_name == "machine") - obj = std::shared_ptr(machine(algo_name)); - else if (module_name == "kernel") - obj = std::shared_ptr(kernel(algo_name)); - else if (module_name == "distance") - obj = std::shared_ptr(distance(algo_name)); - else - SG_SERROR("Unsupported factory \"%s\"\n", module_name.c_str()) + + auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); - auto put_lambda = [&obj, &name, &val_as_string](const auto& val) { - // cast value using type from get, i.e. 
val - auto val_ = char_to_scalar>( - val_as_string.c_str()); - obj->put(name, val_); - }; + std::unique_ptr visitor(new StringToShogun(obj)); if (initialize_with_defaults) { for (const auto& param : params) { Any any_val = obj_param.at(param.first)->get_value(); - name = param.first; - val_as_string = param.second.at("default_value"); - sg_any_dispatch(any_val, sg_all_typemap, put_lambda); + std::string name = param.first; + std::string val_as_string = param.second.at("default_value"); + visitor->set_parameter_name(name); + visitor->set_string_value(val_as_string); + any_val.visit(visitor.get()); } } for (const auto& component : components) { - CSGObject* a = - flow_to_model(component.second, initialize_with_defaults).get(); - // obj->put(component.first, a); + std::shared_ptr nested_obj = + flow_to_model(component.second, initialize_with_defaults); + cast_and_put(obj, nested_obj, component.first); } + SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); + return obj; } @@ -306,15 +503,15 @@ ShogunOpenML::get_class_info(const std::string& class_name) if (std::next(it) == class_name.end()) class_components.emplace_back(std::string(begin, std::next(it))); } - if (class_components.size() != 3) - SG_SERROR("Invalid class name format %s\n", class_name.c_str()) if (class_components[0] == "shogun") result = std::make_tuple(class_components[1], class_components[2]); else SG_SERROR( "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\"\n", + "required library is \"%s\".\n", class_components[0].c_str()) + if (class_components.size() != 3) + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) return result; } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8fc46594a08..8c00ffedb49 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -25,6 +25,9 @@ namespace shogun { + /** + * Reads OpenML streams which can be downloaded with this function. 
+ */ class OpenMLReader { @@ -96,6 +99,7 @@ namespace shogun } private: + /** the raw buffer as a C++ string */ std::string m_curl_response_buffer; /** @@ -114,13 +118,18 @@ namespace shogun */ void openml_curl_error_helper(CURL* curl_handle, CURLcode code); + /** the user API key, not required for all requests */ std::string m_api_key; + /** the server path to get a response in XML format*/ static const char* xml_server; + /** the server path to get a response in JSON format*/ static const char* json_server; + /** the server response format options: XML or JSON */ static const std::unordered_map m_format_options; + /** all the supported server options */ static const std::unordered_map m_request_options; @@ -135,24 +144,48 @@ namespace shogun static const char* flow_file; }; + /** + * Writes OpenML streams to the OpenML server. + */ class OpenMLWritter { public: OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; private: + /** the user API key, likely to be needed to write to OpenML */ std::string m_api_key; }; + /** + * Handles OpenML flows. A flow contains the information + * required to instantiate a model. + */ class OpenMLFlow { public: + /** alias for component type, map of flows */ using components_type = std::unordered_map>; + /** alias for parameter type, map of maps with information specific to a + * parameter */ using parameters_type = std::unordered_map< std::string, std::unordered_map>; + /** + * The OpenMLFlow constructor. This constructor is rarely used by the + * user and is used by the static class members download_flow and + * from_file. The user is expected to use either of the previously + * mentioned functions. + * + * @param name the model name + * @param description the model description + * @param model the flow class_name field + * @param components a map of subflows, i.e. kernels + * @param parameters a map of parameter information, i.e. 
default values + * for each parameter name + */ OpenMLFlow( const std::string& name, const std::string& description, const std::string& model, components_type components, @@ -162,15 +195,39 @@ namespace shogun { } + /** + * Instantiates an OpenMLFlow by downloading a flow from the OpenML server. + * + * @param flow_id the flow ID + * @param api_key the user API key (might not be required and can be an empty string) + * @return the OpenMLFlow corresponding to the flow requested + * @throws ShogunException when there is a server error or the requested flow is ill-formed. + */ static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); + /** + * Instantiates an OpenMLFlow from a file. + * @return the OpenMLFlow corresponding to the flow requested + */ static std::shared_ptr from_file(); + /** + * Publishes a flow to the OpenML server. + * @param flow the flow to be published + */ static void upload_flow(const std::shared_ptr& flow); + /** + * Dumps the OpenMLFlow to disk. + */ void dump(); + /** + * Gets a subflow, i.e. 
a kernel in a machine + * @param name the name of the subflow, not the flow ID + * @return the subflow if it exists + */ std::shared_ptr get_subflow(const std::string& name) { auto find_flow = m_components.find(name); @@ -200,62 +257,66 @@ namespace shogun #endif // SWIG private: + /** name field of the flow */ std::string m_name; + /** description field of the flow */ std::string m_description; + /** the class_name field of the flow */ std::string m_class_name; + /** the parameter field of the flow (optional) */ parameters_type m_parameters; + /** the components fields of the flow (optional) */ components_type m_components; }; -#ifndef SWIG - template - T char_to_scalar(const char* string_val) - { - SG_SERROR( - "No registered conversion from string to type \"s\"\n", - demangled_type().c_str()) - return 0; - } - - template <> - float32_t char_to_scalar(const char* string_val) - { - char* end; - return std::strtof(string_val, &end); - } - - template <> - float64_t char_to_scalar(const char* string_val) + /** + * Handles OpenML tasks. A task contains all the information + * required to train and test a model. + */ + class OpenMLTask { - char* end; - return std::strtod(string_val, &end); - } - - template <> - floatmax_t char_to_scalar(const char* string_val) - { - char* end; - return std::strtold(string_val, &end); - } - - template <> - bool char_to_scalar(const char* string_val) - { - return strcmp(string_val, "true"); - } - -#endif // SWIG + public: + OpenMLTask(); + }; + /** + * The Shogun OpenML extension to run models from an OpenMLFlow + * and convert models to OpenMLFlow. + */ class ShogunOpenML { public: + /** + * Instantiates a SGObject from an OpenMLFlow. 
+ * + * @param flow the flow to instantiate + * @param initialize_with_defaults whether to use the default values + * specified in the flow + * @return the flow as a trainable model + */ static std::shared_ptr flow_to_model( std::shared_ptr flow, bool initialize_with_defaults); + /** + * Converts a SGObject to an OpenMLFlow. + * + * @param model the model to convert + * @return the flow from the model conversion + */ static std::shared_ptr model_to_flow(const std::shared_ptr& model); private: + /** + * Helper function to extract module/factory information from the class + * name field of OpenMLFlow. Throws an error either if the class name + * field is ill formed (i.e. not library.module.algorithm) or if the + * library name is not "shogun". + * + * @param class_name the flow class_name field + * @return a tuple with the module name (factory string) and the + * algorithm name + */ static std::tuple get_class_info(const std::string& class_name); };