Skip to content

Commit

Permalink
Update to work with new array schema and LoL pydantic generator (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
rly authored Sep 19, 2024
1 parent 2659925 commit bacaf30
Show file tree
Hide file tree
Showing 71 changed files with 2,042 additions and 3,806 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,6 @@ dmypy.json
.pyre/

.DS_Store
/my_temperature.zarr
/my_temperature.h5
/out/*
/my_container.h5
/my_container.zarr
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@ Support for loading and dumping N-dimensional arrays in LinkML.
# Quick reference for common commands

```bash
cd linkml-model
poetry run gen-json-schema tests/input/examples/schema_definition-array-2.yaml
poetry run gen-pydantic tests/input/examples/schema_definition-array-2.yaml
poetry run gen-pydantic tests/input/temperature_schema.yaml > tests/array_classes_lol.py
```

# Acknowledgements
Expand Down
3 changes: 2 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import os
from datetime import date

from linkml_arrays import __version__

# -- Project information -----------------------------------------------------
Expand All @@ -24,7 +25,7 @@
"sphinx_rtd_theme",
"sphinx_click",
# "sphinx_autodoc_typehints",
"myst_parser"
"myst_parser",
]

# generate autosummary pages
Expand Down
Binary file removed my_temperature.DaySeries.values.h5
Binary file not shown.
Binary file removed my_temperature.DaySeries.values.npy
Binary file not shown.
Binary file removed my_temperature.LatitudeSeries.values.h5
Binary file not shown.
Binary file removed my_temperature.LatitudeSeries.values.npy
Binary file not shown.
Binary file removed my_temperature.LongitudeSeries.values.h5
Binary file not shown.
Binary file removed my_temperature.LongitudeSeries.values.npy
Binary file not shown.
Binary file removed my_temperature.TemperatureMatrix.values.h5
Binary file not shown.
Binary file removed my_temperature.TemperatureMatrix.values.npy
Binary file not shown.
3,627 changes: 780 additions & 2,847 deletions poetry.lock

Large diffs are not rendered by default.

62 changes: 31 additions & 31 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,49 +1,50 @@
[tool.poetry]
name = "linkml-arrays"
version = "0.0.0"
version = "0.1.0"
description = "linkml-arrays"
authors = ["Ryan Ly <[email protected]>"]
authors = [
"Ryan Ly <[email protected]>",
"Chris Mungall <[email protected]>",
]
license = "BSD-3"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
linkml-runtime = ">=1.7.0"
linkml-runtime = ">=1.8.0"
numpy = ">=1.24.3"
h5py = ">=3.9.0"
zarr = ">=2.16.1"
nptyping = ">=2.5.0"
xarray = "^2024.1.1"
tox = "^3.25.1" # TODO move out of main deps
ruamel-yaml = "^0.18.6"
importlib_metadata = "*"

[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
sphinx = {version = "^5.3.0", extras = ["docs"]}
sphinx-rtd-theme = {version = "^1.0.0", extras = ["docs"]}
# sphinx-autodoc-typehints = {version = "^1.19.4", extras = ["docs"]}
sphinx-click = {version = "^4.3.0", extras = ["docs"]}
myst-parser = {version = "^0.18.1", extras = ["docs"]}
jupyter = {version = "*", extras = ["jupyter"]}

[tool.poetry.scripts]
linkml-arrays = "linkml_arrays.cli:main"
pytest = "*"
tox = "*"
# sphinx = {version = "*", extras = ["docs"]}
# sphinx-rtd-theme = {version = "^1.0.0", extras = ["docs"]}
# # sphinx-autodoc-typehints = {version = "^1.19.4", extras = ["docs"]}
# sphinx-click = {version = "^4.3.0", extras = ["docs"]}
# myst-parser = {version = "*", extras = ["docs"]}
# jupyter = {version = "*", extras = ["jupyter"]}

[tool.poetry.extras]
docs = [
"sphinx",
"sphinx-rtd-theme",
# "sphinx-autodoc-typehints",
"sphinx-click",
"myst-parser"
]
jupyter = [
"jupyter"
]
# [tool.poetry.extras]
# docs = [
# "sphinx",
# "sphinx-rtd-theme",
# "sphinx-autodoc-typehints",
# "sphinx-click",
# "myst-parser"
# ]
# jupyter = [
# "jupyter"
# ]

[tool.poetry.group.dev.dependencies]
black = "^24.1.1"
pytest = "^7.1.2"
mypy = "^1.8.0"
# [tool.poetry.group.dev.dependencies]
# black = "^24.1.1"
# pytest = "^7.1.2"
# mypy = "^1.8.0"

[tool.poetry-dynamic-versioning]
enable = true
Expand All @@ -52,7 +53,6 @@ style = "pep440"

[tool.black]
line-length = 100
target-version = ["py38", "py39", "py310"]

[tool.isort]
profile = "black"
Expand Down
44 changes: 0 additions & 44 deletions src/linkml_arrays/cli.py

This file was deleted.

24 changes: 11 additions & 13 deletions src/linkml_arrays/dumpers/hdf5_dumper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Class for dumping a LinkML model to an HDF5 file."""

from pathlib import Path
from typing import Union

import h5py
Expand All @@ -14,15 +15,15 @@ def _iterate_element(
):
"""Recursively iterate through the elements of a LinkML model and save them.
Writes Pydantic BaseModel objects as groups, slots that implement "linkml:elements"
Write Pydantic BaseModel objects as groups, slots with the "array" element
as datasets, and other slots as attributes.
"""
# get the type of the element
element_type = type(element).__name__

for k, v in vars(element).items():
found_slot = schemaview.induced_slot(k, element_type)
if "linkml:elements" in found_slot.implements:
if found_slot.array:
# save the numpy array to an hdf5 dataset
group.create_dataset(found_slot.name, data=v)
else:
Expand All @@ -39,16 +40,13 @@ class Hdf5Dumper(Dumper):
"""Dumper class for LinkML models to HDF5 files."""

# TODO is this the right method to overwrite? it does not dump a string
def dumps(self, element: Union[YAMLRoot, BaseModel], schemaview: SchemaView, **kwargs):
"""Dump the element to an HDF5 file.
Raises:
ValueError: If the class requires an identifier and it is not provided.
"""
id_slot = schemaview.get_identifier_slot(element.__class__.__name__)
if id_slot is None:
raise ValueError("The class requires an identifier.")
id_value = getattr(element, id_slot.name)
output_file_path = f"{id_value}.h5"
def dumps(
self,
element: Union[YAMLRoot, BaseModel],
schemaview: SchemaView,
output_file_path: Union[str, Path],
**kwargs,
):
"""Dump the element to an HDF5 file."""
with h5py.File(output_file_path, "w") as f:
_iterate_element(element, schemaview, f)
123 changes: 123 additions & 0 deletions src/linkml_arrays/dumpers/yaml_array_file_dumper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Base class for dumping a LinkML model to YAML with paths to files containing arrays."""

import os
from abc import ABCMeta, abstractmethod
from collections.abc import Callable
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import yaml
from linkml_runtime import SchemaView
from linkml_runtime.dumpers.dumper_root import Dumper
from linkml_runtime.utils.yamlutils import YAMLRoot
from pydantic import BaseModel


def _iterate_element(
    element: Union[YAMLRoot, BaseModel],
    schemaview: SchemaView,
    output_dir: Path,
    write_array: Callable,
    format: str,  # NOTE: shadows the builtin `format`; kept for interface compatibility
    parent_identifier=None,
    inlined_name=None,
):
    """Recursively iterate through the elements of a LinkML model and save them.

    Return a dictionary with the same structure as the input element, but where the slots
    with the "array" element are written to an array file and the paths to these
    files are returned in the dictionary. The paths are relative to the output directory.

    Args:
        element: The model object (or nested sub-object) to serialize.
        schemaview: SchemaView for the schema that declares ``element``'s class.
        output_dir: Directory that array files are written into (paths in the
            returned dict are relative to it).
        write_array: Callback ``(value, path_without_suffix) -> written_path``
            supplied by the concrete dumper subclass.
        format: Format label recorded in each array's "source" entry.
        parent_identifier: Identifier value of the enclosing object, used to name
            array files when this object has no identifier slot of its own.
        inlined_name: Slot name under which this object is inlined in its parent,
            used as an extra filename component.

    Raises:
        ValueError: If the class requires an identifier and it is not provided.
    """
    # get the type of the element
    element_type = type(element).__name__

    # ask schemaview whether it has a class by this name
    found_class = schemaview.get_class(element_type)

    id_slot = schemaview.get_identifier_slot(found_class.name)
    if id_slot is not None:
        id_value = getattr(element, id_slot.name)
    else:
        id_value = None

    ret_dict = dict()
    for k, v in vars(element).items():
        found_slot = schemaview.induced_slot(k, element_type)
        if found_slot.array:
            # Array slots must be traceable to an identifier so the file name is unique.
            if id_slot is None and parent_identifier is None:
                raise ValueError("The class requires an identifier.")

            # determine the output file name without the suffix
            # Precedence: own id > parent id + inlined slot name > parent id > slot name only.
            if id_slot is not None:
                output_file_name = f"{id_value}.{found_slot.name}"
            elif inlined_name is not None:
                output_file_name = f"{parent_identifier}.{inlined_name}.{found_slot.name}"
            elif parent_identifier is not None:
                output_file_name = f"{parent_identifier}.{found_slot.name}"
            else:
                output_file_name = f"{found_slot.name}"

            # if output_dir is absolute, make it relative to current working directory
            # and create the directory if it does not exist
            # NOTE(review): rebinding output_dir here also affects later loop iterations
            # and the recursive calls below — presumably intentional; confirm.
            if output_dir.is_absolute():
                output_dir = Path(os.path.relpath(output_dir, start=os.getcwd()))
            # NOTE(review): no parents=True — a nested output_dir that does not yet
            # exist would raise FileNotFoundError; confirm single-level dirs suffice.
            output_dir.mkdir(exist_ok=True)
            output_file_path_no_suffix = output_dir / output_file_name

            # save the numpy array to file and write the file path to the dictionary
            output_file_path = write_array(v, output_file_path_no_suffix)
            # Emit the array-file reference structure expected by the loader side.
            ret_dict[k] = {
                "source": [
                    {
                        "file": f"./{output_file_path}",
                        "format": format,
                    }
                ]
            }
        else:
            if isinstance(v, BaseModel):
                # Recurse into nested Pydantic objects, passing our identifier down
                # so their array files can be named after this parent.
                v2 = _iterate_element(
                    v,
                    schemaview,
                    output_dir,
                    write_array,
                    format,
                    id_value,
                    inlined_name=found_slot.name,
                )
                ret_dict[k] = v2
            else:
                # Scalar / non-array slot: copy through unchanged.
                ret_dict[k] = v
    return ret_dict


class YamlArrayFileDumper(Dumper, metaclass=ABCMeta):
    """Base dumper class for LinkML models to YAML files with paths to array files.

    Subclasses must define a ``FORMAT`` class attribute (the format label written
    into each array's "source" entry) and implement :meth:`write_array` to persist
    a single array slot value and return the path it was written to.
    """

    # FORMAT is a class attribute that must be set by subclasses

    def dumps(
        self,
        element: Union[YAMLRoot, BaseModel],
        schemaview: SchemaView,
        output_dir: Optional[Union[str, Path]] = None,
        **kwargs,
    ) -> str:
        """Return element formatted as a YAML string.

        Array slots are written to files under ``output_dir`` (default: current
        directory) via the subclass's ``write_array``; the YAML contains relative
        paths to those files instead of the array data.
        """
        if output_dir is None:
            output_dir = "."
        # Renamed from `input` to avoid shadowing the builtin.
        serialized = _iterate_element(
            element, schemaview, Path(output_dir), self.write_array, self.FORMAT
        )

        return yaml.dump(serialized)

    @classmethod
    @abstractmethod
    def write_array(cls, array: Union[List, np.ndarray], output_file_path: Union[str, Path]):
        """Write an array to a file and return the path of the written file."""
        raise NotImplementedError("Subclasses must implement this method.")
11 changes: 6 additions & 5 deletions src/linkml_arrays/dumpers/yaml_dumper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Class for dumpling a LinkML model to a YAML file."""
"""Class for dumping a LinkML model to YAML."""

from typing import Union

Expand All @@ -14,8 +14,8 @@ def _iterate_element(
):
"""Recursively iterate through the elements of a LinkML model and save them.
Returns a dictionary with the same structure as the input element, but with the slots
that implement "linkml:elements" (arrays) are written as lists or lists of lists.
Returns a dictionary with the same structure as the input element, but where the slots
with the "array" element are written as lists of lists in YAML.
Raises:
ValueError: If the class requires an identifier and it is not provided.
Expand All @@ -35,10 +35,11 @@ def _iterate_element(
ret_dict = dict()
for k, v in vars(element).items():
found_slot = schemaview.induced_slot(k, element_type)
if "linkml:elements" in found_slot.implements:
if found_slot.array:
if id_slot is None and parent_identifier is None:
raise ValueError("The class requires an identifier.")
ret_dict[k] = v.tolist()
assert isinstance(v, list)
ret_dict[k] = v
else:
if isinstance(v, BaseModel):
v2 = _iterate_element(v, schemaview, id_value)
Expand Down
Loading

0 comments on commit bacaf30

Please sign in to comment.