From 92b60b0acfe920865ee73cfafbf7aa5ba884d622 Mon Sep 17 00:00:00 2001 From: David Linke Date: Sat, 17 Aug 2024 16:13:06 +0200 Subject: [PATCH 1/2] Prepare adding shacl import --- docs/introduction.rst | 3 +- docs/packages/importers.rst | 10 + schema_automator/cli.py | 28 +++ schema_automator/importers/__init__.py | 1 + .../importers/shacl_import_engine.py | 227 ++++++++++++++++++ tests/resources/test_shacl_simple.ttl | 1 + tests/test_importers/test_shacl_importer.py | 41 ++++ 7 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 schema_automator/importers/shacl_import_engine.py create mode 100644 tests/resources/test_shacl_simple.ttl create mode 100644 tests/test_importers/test_shacl_importer.py diff --git a/docs/introduction.rst b/docs/introduction.rst index 06b93f4..c8fe220 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -27,10 +27,11 @@ Importing from alternative modeling frameworks See :ref:`importers` * OWL (but this only works for schema-style OWL) +* SHACL (in progress) * JSON-Schema * SQL DDL -In future other frameworks will be supported +In future other frameworks will be supported. Annotating schemas ------------------ diff --git a/docs/packages/importers.rst b/docs/packages/importers.rst index 0aa4546..bdeb1fa 100644 --- a/docs/packages/importers.rst +++ b/docs/packages/importers.rst @@ -42,6 +42,16 @@ Use robot to convert ahead of time: robot convert -i schemaorg.ttl -o schemaorg.ofn schemauto import-owl schemaorg.ofn +Importing from SHACL +-------------------- + +You can import from a SHACL shapes file. + +.. code-block:: + + schemauto import-shacl tests/resources/test_shacl_simple.ttl + + Importing from SQL ------------------ diff --git a/schema_automator/cli.py b/schema_automator/cli.py index bd76320..8e29b40 100644 --- a/schema_automator/cli.py +++ b/schema_automator/cli.py @@ -497,6 +497,34 @@ def import_rdfs(rdfsfile, output, metamodel_mappings, **args): schema = sie.convert(rdfsfile, **args) write_schema(schema, output) +@main.command() +@click.argument('shaclfile') +@output_option +@schema_name_option +@click.option('--input-type', '-I', + default='turtle', + help="Input format, eg. turtle") +@click.option('--identifier', '-I', help="Slot to use as identifier") +@click.option('--model-uri', help="Model URI prefix") +@click.option('--metamodel-mappings', + help="Path to metamodel mappings YAML dictionary") +@click.option('--output', '-o', help="Path to saved yaml schema") +def import_shacl(shaclfile, output, metamodel_mappings, **args): + """ + Import an SHACL profile to LinkML + + Example: + + schemauto import-shacl mymodel.shacl.ttl -o mymodel.yaml + """ + mappings_obj = None + if metamodel_mappings: + with open(metamodel_mappings) as f: + mappings_obj = yaml.safe_load(f) + sie = ShaclImportEngine(initial_metamodel_mappings=mappings_obj) + schema = sie.convert(shaclfile, **args) + write_schema(schema, output) + @main.command() @click.argument('rdffile') @output_option diff --git a/schema_automator/importers/__init__.py b/schema_automator/importers/__init__.py index 2011d25..fa187ac 100644 --- a/schema_automator/importers/__init__.py +++ b/schema_automator/importers/__init__.py @@ -3,3 +3,4 @@ from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine from schema_automator.importers.frictionless_import_engine import FrictionlessImportEngine from schema_automator.importers.cadsr_import_engine import CADSRImportEngine +from schema_automator.importers.shacl_import_engine import ShaclImportEngine diff --git a/schema_automator/importers/shacl_import_engine.py b/schema_automator/importers/shacl_import_engine.py new file mode 100644 index 0000000..39a1b98 --- /dev/null +++ b/schema_automator/importers/shacl_import_engine.py @@ -0,0 +1,227 @@ +import logging + +from linkml.utils.schema_builder import SchemaBuilder +from linkml_runtime import SchemaView +from linkml_runtime.linkml_model import ( + SchemaDefinition, + SlotDefinition, + ClassDefinition, +) + + +HTTP_SDO = Namespace("http://schema.org/") + +DEFAULT_METAMODEL_MAPPINGS = { + "is_a": [RDFS.subClassOf, SKOS.broader], + "domain_of": [HTTP_SDO.domainIncludes, SDO.domainIncludes], + "rangeIncludes": [HTTP_SDO.rangeIncludes, SDO.rangeIncludes], + "exact_mappings": [OWL.sameAs, HTTP_SDO.sameAs], + ClassDefinition.__name__: [RDFS.Class, OWL.Class, SKOS.Concept], + SlotDefinition.__name__: [ + RDF.Property, + OWL.ObjectProperty, + OWL.DatatypeProperty, + OWL.AnnotationProperty, + ], +} + + +@dataclass +class ShaclImportEngine(ImportEngine): + """ + An ImportEngine that takes SHACL and converts it to a LinkML schema + """ + + mappings: dict = None + initial_metamodel_mappings: Dict[str, List[URIRef]] = None + metamodel_mappings: Dict[str, List[URIRef]] = None + reverse_metamodel_mappings: Dict[URIRef, List[str]] = None + include_unmapped_annotations = False + metamodel = None + metamodel_schemaview: SchemaView = None + classdef_slots: List[str] = None + + def __post_init__(self): + sv = package_schemaview("linkml_runtime.linkml_model.meta") + self.metamodel_schemaview = sv + self.metamodel = sv + self.metamodel_mappings = defaultdict(list) + self.reverse_metamodel_mappings = defaultdict(list) + for k, vs in DEFAULT_METAMODEL_MAPPINGS.items(): + self.metamodel_mappings[k].extend(vs) + for v in vs: + self.reverse_metamodel_mappings[v].append(k) + if self.initial_metamodel_mappings: + for k, vs in self.initial_metamodel_mappings.items(): + if not isinstance(vs, list): + vs = [vs] + self.metamodel_mappings[k].extend(vs) + for v in vs: + self.reverse_metamodel_mappings[URIRef(v)].append(k) + logging.info(f"Adding mapping {k} -> {v}") + for e in sv.all_elements().values(): + mappings = [] + for ms in sv.get_mappings(e.name, expand=True).values(): + for m in ms: + uri = URIRef(m) + mappings.append(uri) + self.reverse_metamodel_mappings[uri].append(e.name) + self.metamodel_mappings[e.name] = mappings + self.defclass_slots = [s.name for s in sv.class_induced_slots(ClassDefinition.class_name)] + + def convert( + self, + file: str, + name: str = None, + format="turtle", + default_prefix: str = None, + model_uri: str = None, + identifier: str = None, + **kwargs, + ) -> SchemaDefinition: + """ + Converts an OWL schema-style ontology + + :param file: + :param name: + :param model_uri: + :param identifier: + :param kwargs: + :return: + """ + self.mappings = {} + g = Graph() + g.parse(file, format=format) + if name is not None and default_prefix is None: + default_prefix = name + if name is None: + name = default_prefix + if name is None: + name = "example" + sb = SchemaBuilder(name=name) + sb.add_defaults() + schema = sb.schema + for k, v in g.namespaces(): + if k == "schema" and v != "http://schema.org/": + continue + sb.add_prefix(k, v, replace_if_present=True) + if default_prefix is not None: + schema.default_prefix = default_prefix + if default_prefix not in schema.prefixes: + sb.add_prefix(default_prefix, model_uri, replace_if_present=True) + schema.id = schema.prefixes[default_prefix].prefix_reference + cls_slots = defaultdict(list) + props = [] + for rdfs_property_metaclass in self._rdfs_metamodel_iri( + SlotDefinition.__name__ + ): + for p in g.subjects(RDF.type, rdfs_property_metaclass): + props.append(p) + # implicit properties + for metap in ( + self.reverse_metamodel_mappings["domain_of"] + + self.reverse_metamodel_mappings["rangeIncludes"] + ): + for p, _, _o in g.triples((None, metap, None)): + props.append(p) + for p in set(props): + sn = self.iri_to_name(p) + init_dict = self._dict_for_subject(g, p) + if "domain_of" in init_dict: + for x in init_dict["domain_of"]: + cls_slots[x].append(sn) + del init_dict["domain_of"] + if "rangeIncludes" in init_dict: + init_dict["any_of"] = [{"range": x} for x in init_dict["rangeIncludes"]] + del init_dict["rangeIncludes"] + slot = SlotDefinition(sn, **init_dict) + slot.slot_uri = str(p.n3(g.namespace_manager)) + sb.add_slot(slot) + rdfs_classes = [] + for rdfs_class_metaclass in self._rdfs_metamodel_iri(ClassDefinition.__name__): + for s in g.subjects(RDF.type, rdfs_class_metaclass): + rdfs_classes.append(s) + # implicit classes + for metap in [RDFS.subClassOf]: + for s, _, o in g.triples((None, metap, None)): + rdfs_classes.append(s) + rdfs_classes.append(o) + for s in set(rdfs_classes): + cn = self.iri_to_name(s) + init_dict = self._dict_for_subject(g, s) + c = ClassDefinition(cn, **init_dict) + c.slots = cls_slots.get(cn, []) + c.class_uri = str(s.n3(g.namespace_manager)) + sb.add_class(c) + if identifier is not None: + id_slot = SlotDefinition(identifier, identifier=True, range="uriorcurie") + schema.slots[identifier] = id_slot + for c in schema.classes.values(): + if not c.is_a and not c.mixins: + if identifier not in c.slots: + c.slots.append(identifier) + return schema + + def _dict_for_subject(self, g: Graph, s: URIRef) -> Dict[str, Any]: + """ + Looks up triples for a subject and converts to dict using linkml keys. + + :param g: + :param p: + :return: + """ + init_dict = {} + for pp, obj in g.predicate_objects(s): + if pp == RDF.type: + continue + metaslot_name = self._element_from_iri(pp) + logging.debug(f"Mapping {pp} -> {metaslot_name}") + if metaslot_name not in self.defclass_slots: + continue + if metaslot_name is None: + logging.warning(f"Not mapping {pp}") + continue + if metaslot_name == "name": + metaslot_name = "title" + metaslot = self.metamodel.get_slot(metaslot_name) + v = self._object_to_value(obj, metaslot=metaslot) + metaslot_name_safe = underscore(metaslot_name) + if not metaslot or metaslot.multivalued: + if metaslot_name_safe not in init_dict: + init_dict[metaslot_name_safe] = [] + init_dict[metaslot_name_safe].append(v) + else: + init_dict[metaslot_name_safe] = v + return init_dict + + def _rdfs_metamodel_iri(self, name: str) -> List[URIRef]: + return self.metamodel_mappings.get(name, []) + + def _element_from_iri(self, iri: URIRef) -> str: + r = self.reverse_metamodel_mappings.get(iri, []) + if len(r) > 0: + if len(r) > 1: + logging.debug(f"Multiple mappings for {iri}: {r}") + return r[0] + + def _object_to_value(self, obj: Any, metaslot: SlotDefinition = None) -> Any: + if isinstance(obj, URIRef): + if metaslot.range == "uriorcurie" or metaslot.range == "uri": + return str(obj) + return self.iri_to_name(obj) + if isinstance(obj, Literal): + return obj.value + return obj + + def iri_to_name(self, v: URIRef) -> str: + n = self._as_name(v) + if n != v: + self.mappings[n] = v + return n + + def _as_name(self, v: URIRef): + v = str(v) + for sep in ["#", "/", ":"]: + if sep in v: + return v.split(sep)[-1] + return v diff --git a/tests/resources/test_shacl_simple.ttl b/tests/resources/test_shacl_simple.ttl new file mode 100644 index 0000000..9b869e0 --- /dev/null +++ b/tests/resources/test_shacl_simple.ttl @@ -0,0 +1 @@ +# tbw diff --git a/tests/test_importers/test_shacl_importer.py b/tests/test_importers/test_shacl_importer.py new file mode 100644 index 0000000..6970514 --- /dev/null +++ b/tests/test_importers/test_shacl_importer.py @@ -0,0 +1,41 @@ +import os +import pytest + +from linkml_runtime import SchemaView + +from schema_automator.importers.shacl_import_engine import ShaclImportEngine +from linkml.generators.yamlgen import YAMLGenerator + +from schema_automator.utils.schemautils import write_schema +from tests import INPUT_DIR, OUTPUT_DIR + +# TODO - Write tests (this is a copy of test_rdfs_importer) + +REPRO = os.path.join(INPUT_DIR, 'reproschema.ttl') +OUTSCHEMA = os.path.join(OUTPUT_DIR, 'reproschema-from-ttl.yaml') + + + +def test_from_shacl(): + """Test Shacl conversion.""" + oie = ShaclImportEngine() + + return + schema = oie.convert(REPRO, default_prefix='reproschema', identifier='id') + write_schema(schema, OUTSCHEMA) + # roundtrip + s = YAMLGenerator(OUTSCHEMA).serialize() + print(s[0:100]) + sv = SchemaView(OUTSCHEMA) + activity = sv.get_class("Activity") + assert activity + assert activity.name == "Activity" + assert activity.is_a == "CreativeWork" + slots = sv.class_induced_slots(activity.name) + assert len(slots) == 1 + slot = slots[0] + assert slot.name == "id" + + + + From 71c0099c807fe9fac04a3ef82c26b3e4a9ff10f0 Mon Sep 17 00:00:00 2001 From: David Linke Date: Tue, 8 Oct 2024 08:15:04 +0200 Subject: [PATCH 2/2] WIP (sync between work places) --- poetry.lock | 16 ++++----- pyproject.toml | 13 ++++--- .../importers/shacl_import_engine.py | 18 ++++++++-- tests/__init__.py | 3 -- tests/resources/shacl_simple.ttl | 34 +++++++++++++++++++ tests/test_importers/test_shacl_importer.py | 15 +++----- 6 files changed, 70 insertions(+), 29 deletions(-) create mode 100644 tests/resources/shacl_simple.ttl diff --git a/poetry.lock b/poetry.lock index dd714bc..3666b03 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "airium" @@ -3292,9 +3292,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3792,8 +3792,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4116,7 +4116,6 @@ description = "A pure Python implementation of the trie data structure." optional = false python-versions = "*" files = [ - {file = "PyTrie-0.4.0-py3-none-any.whl", hash = "sha256:f687c224ee8c66cda8e8628a903011b692635ffbb08d4b39c5f92b18eb78c950"}, {file = "PyTrie-0.4.0.tar.gz", hash = "sha256:8f4488f402d3465993fb6b6efa09866849ed8cda7903b50647b7d0342b805379"}, ] @@ -5095,7 +5094,7 @@ sphinx = ">=4.0" name = "sphinx-pdj-theme" version = "0.4.0" description = "A cool theme for sphinx documentation" -optional = false +optional = true python-versions = "*" files = [ {file = "sphinx-pdj-theme-0.4.0.tar.gz", hash = "sha256:4b86bfd8b8e20344db56aba13473f634286149fa0203d18e0437157f48c7e0fa"}, @@ -5167,7 +5166,7 @@ test = ["flake8", "mypy", "pytest"] name = "sphinxcontrib-mermaid" version = "0.9.2" description = "Mermaid diagrams in yours Sphinx powered docs" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "sphinxcontrib-mermaid-0.9.2.tar.gz", hash = "sha256:252ef13dd23164b28f16d8b0205cf184b9d8e2b714a302274d9f59eb708e77af"}, @@ -5959,10 +5958,9 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -docs = [] -mariadb = [] +docs = ["Sphinx", "sphinx-pdj-theme", "sphinxcontrib-mermaid"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "036cba73b6fd660157c70cb76be27a501017e8904b35c8d2ccb00d412bbba870" +content-hash = "9c29a704add4aaf15c228f9d6a81164390f060582bee85a89d266e2232c4b0ed" diff --git a/pyproject.toml b/pyproject.toml index 684e019..1f31453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,16 +54,19 @@ linkml-runtime = "^1.7.2" duckdb = "^0.10.1" numpy = "<2.0" +Sphinx = { version = ">=4.4.0", optional = true } +sphinx-pdj-theme = { version = ">=0.2.1", optional = true } +sphinx-click = ">=3.1.0" +sphinxcontrib-mermaid = { version = ">=0.9.2", optional = true } + [tool.poetry.dev-dependencies] pytest = ">=7.1.1" -Sphinx = ">=4.4.0" -sphinx-pdj-theme = ">=0.2.1" -sphinx-click = ">=3.1.0" -sphinxcontrib-mermaid = ">=0.9.2" myst-parser = "*" jupyter = ">=1.0.0" lxml = ">=4.9.1" +#mariadb = { version = "^1.3", optional = true } + [tool.poetry.group.llm.dependencies] llm = ">=0.12" @@ -82,7 +85,7 @@ extract-schema = "schema_automator.utils.schema_extractor:cli" [tool.poetry.extras] docs = ["Sphinx", "sphinx-pdj-theme", "sphinxcontrib-mermaid"] -mariadb = ["mariadb"] +#mariadb = ["mariadb"] [tool.codespell] # Ref: https://github.com/codespell-project/codespell#using-a-config-file diff --git a/schema_automator/importers/shacl_import_engine.py b/schema_automator/importers/shacl_import_engine.py index 39a1b98..352c9fa 100644 --- a/schema_automator/importers/shacl_import_engine.py +++ b/schema_automator/importers/shacl_import_engine.py @@ -1,13 +1,25 @@ +from collections import defaultdict import logging +from dataclasses import dataclass +from typing import Dict, List, Any + +from rdflib import Graph, RDF, OWL, URIRef, RDFS, SKOS, SDO, Namespace + +from funowl import Literal + from linkml.utils.schema_builder import SchemaBuilder from linkml_runtime import SchemaView +from linkml_runtime.utils.formatutils import underscore +from linkml_runtime.utils.introspection import package_schemaview from linkml_runtime.linkml_model import ( SchemaDefinition, SlotDefinition, ClassDefinition, ) +from schema_automator.importers.import_engine import ImportEngine +logger = logging.getLogger(__name__) HTTP_SDO = Namespace("http://schema.org/") @@ -80,7 +92,7 @@ def convert( **kwargs, ) -> SchemaDefinition: """ - Converts an OWL schema-style ontology + Converts an shacl shapes file :param file: :param name: @@ -110,6 +122,7 @@ def convert( if default_prefix not in schema.prefixes: sb.add_prefix(default_prefix, model_uri, replace_if_present=True) schema.id = schema.prefixes[default_prefix].prefix_reference + cls_slots = defaultdict(list) props = [] for rdfs_property_metaclass in self._rdfs_metamodel_iri( @@ -137,6 +150,7 @@ def convert( slot = SlotDefinition(sn, **init_dict) slot.slot_uri = str(p.n3(g.namespace_manager)) sb.add_slot(slot) + rdfs_classes = [] for rdfs_class_metaclass in self._rdfs_metamodel_iri(ClassDefinition.__name__): for s in g.subjects(RDF.type, rdfs_class_metaclass): @@ -201,7 +215,7 @@ def _element_from_iri(self, iri: URIRef) -> str: r = self.reverse_metamodel_mappings.get(iri, []) if len(r) > 0: if len(r) > 1: - logging.debug(f"Multiple mappings for {iri}: {r}") + logger.debug(f"Multiple mappings for {iri}: {r}") return r[0] def _object_to_value(self, obj: Any, metaslot: SlotDefinition = None) -> Any: diff --git a/tests/__init__.py b/tests/__init__.py index b092ca8..ad4619d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,9 +1,6 @@ import os -import pprint ROOT = os.path.abspath(os.path.dirname(__file__)) INPUT_DIR = os.path.join(ROOT, 'resources') OUTPUT_DIR = os.path.join(ROOT, 'outputs') MODEL_DIR = os.path.join(ROOT, 'test_models') - - diff --git a/tests/resources/shacl_simple.ttl b/tests/resources/shacl_simple.ttl new file mode 100644 index 0000000..9c3b08b --- /dev/null +++ b/tests/resources/shacl_simple.ttl @@ -0,0 +1,34 @@ +# example from http://book.validatingrdf.com/bookHtml011.html#ch050SHACLExample + +@prefix schema: . +@prefix sh: . +@prefix xsd: . +@prefix ex: . + +ex:UserShape a sh:NodeShape; + sh:targetClass ex:User ; + sh:property [ # Blank node 1 + sh:path schema:name ; + sh:minCount 1; + sh:maxCount 1; + sh:datatype xsd:string ; + ] ; + sh:property [ # Blank node 2 + sh:path schema:gender ; + sh:minCount 1; + sh:maxCount 1; + sh:or ( + [ sh:in (schema:Male schema:Female) ] + [ sh:datatype xsd:string] + ) + ] ; + sh:property [ # Blank node 3 + sh:path schema:birthDate ; + sh:maxCount 1; + sh:datatype xsd:date ; + ] ; + sh:property [ # Blank node 4 + sh:path schema:knows ; + sh:nodeKind sh:IRI ; + sh:class ex:User ; + ] . diff --git a/tests/test_importers/test_shacl_importer.py b/tests/test_importers/test_shacl_importer.py index 6970514..556c58f 100644 --- a/tests/test_importers/test_shacl_importer.py +++ b/tests/test_importers/test_shacl_importer.py @@ -11,18 +11,17 @@ # TODO - Write tests (this is a copy of test_rdfs_importer) -REPRO = os.path.join(INPUT_DIR, 'reproschema.ttl') -OUTSCHEMA = os.path.join(OUTPUT_DIR, 'reproschema-from-ttl.yaml') - +REPRO = os.path.join(INPUT_DIR, 'shacl_simple.ttl') +OUTSCHEMA = os.path.join(OUTPUT_DIR, 'user_from_shacl_simple2.yaml') def test_from_shacl(): """Test Shacl conversion.""" - oie = ShaclImportEngine() + sie = ShaclImportEngine() - return - schema = oie.convert(REPRO, default_prefix='reproschema', identifier='id') + schema = sie.convert(REPRO, default_prefix='usr', identifier='id') write_schema(schema, OUTSCHEMA) + return # roundtrip s = YAMLGenerator(OUTSCHEMA).serialize() print(s[0:100]) @@ -35,7 +34,3 @@ def test_from_shacl(): assert len(slots) == 1 slot = slots[0] assert slot.name == "id" - - - -