Update to work with new array schema and LoL pydantic generator #14

Merged 20 commits on Sep 19, 2024
5 changes: 3 additions & 2 deletions .gitignore
@@ -129,5 +129,6 @@ dmypy.json
 .pyre/

 .DS_Store
-/my_temperature.zarr
-/my_temperature.h5
+/out/*
+/my_container.h5
+/my_container.zarr
4 changes: 1 addition & 3 deletions README.md
@@ -5,9 +5,7 @@ Support for loading and dumping N-dimensional arrays in LinkML.
 # Quick reference for common commands

 ```bash
-cd linkml-model
-poetry run gen-json-schema tests/input/examples/schema_definition-array-2.yaml
-poetry run gen-pydantic tests/input/examples/schema_definition-array-2.yaml
+poetry run gen-pydantic tests/input/temperature_schema.yaml > tests/array_classes_lol.py
 ```

 # Acknowledgements
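For context on what the new command produces: the "LoL" (list-of-lists) pydantic generator emits classes whose array slots are typed as nested Python lists rather than NumPy arrays. A minimal sketch of what such a generated class might look like (class and slot names are illustrative, not the actual contents of tests/array_classes_lol.py):

```python
from typing import List

from pydantic import BaseModel


class TemperatureMatrix(BaseModel):
    """Hypothetical generated class with a 2D array slot."""

    name: str
    # Array data is represented as plain nested lists ("LoL"),
    # so instances serialize to YAML/JSON without NumPy.
    values: List[List[float]]
```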
3 changes: 2 additions & 1 deletion docs/conf.py
Expand Up @@ -5,6 +5,7 @@

import os
from datetime import date

from linkml_arrays import __version__

# -- Project information -----------------------------------------------------
Expand All @@ -24,7 +25,7 @@
"sphinx_rtd_theme",
"sphinx_click",
# "sphinx_autodoc_typehints",
"myst_parser"
"myst_parser",
]

# generate autosummary pages
Binary file removed my_temperature.DaySeries.values.h5
Binary file removed my_temperature.DaySeries.values.npy
Binary file removed my_temperature.LatitudeSeries.values.h5
Binary file removed my_temperature.LatitudeSeries.values.npy
Binary file removed my_temperature.LongitudeSeries.values.h5
Binary file removed my_temperature.LongitudeSeries.values.npy
Binary file removed my_temperature.TemperatureMatrix.values.h5
Binary file removed my_temperature.TemperatureMatrix.values.npy
3,627 changes: 780 additions & 2,847 deletions poetry.lock

Large diffs are not rendered by default.

62 changes: 31 additions & 31 deletions pyproject.toml
@@ -1,49 +1,50 @@
 [tool.poetry]
 name = "linkml-arrays"
-version = "0.0.0"
+version = "0.1.0"
 description = "linkml-arrays"
-authors = ["Ryan Ly <[email protected]>"]
+authors = [
+    "Ryan Ly <[email protected]>",
+    "Chris Mungall <[email protected]>",
+]
 license = "BSD-3"
 readme = "README.md"

 [tool.poetry.dependencies]
 python = "^3.9"
-linkml-runtime = ">=1.7.0"
+linkml-runtime = ">=1.8.0"
 numpy = ">=1.24.3"
 h5py = ">=3.9.0"
 zarr = ">=2.16.1"
-nptyping = ">=2.5.0"
-xarray = "^2024.1.1"
-tox = "^3.25.1" # TODO move out of main deps
+ruamel-yaml = "^0.18.6"
+importlib_metadata = "*"

 [tool.poetry.dev-dependencies]
-pytest = "^7.1.2"
-sphinx = {version = "^5.3.0", extras = ["docs"]}
-sphinx-rtd-theme = {version = "^1.0.0", extras = ["docs"]}
-# sphinx-autodoc-typehints = {version = "^1.19.4", extras = ["docs"]}
-sphinx-click = {version = "^4.3.0", extras = ["docs"]}
-myst-parser = {version = "^0.18.1", extras = ["docs"]}
-jupyter = {version = "*", extras = ["jupyter"]}
-
-[tool.poetry.scripts]
-linkml-arrays = "linkml_arrays.cli:main"
+pytest = "*"
+tox = "*"
+# sphinx = {version = "*", extras = ["docs"]}
+# sphinx-rtd-theme = {version = "^1.0.0", extras = ["docs"]}
+# # sphinx-autodoc-typehints = {version = "^1.19.4", extras = ["docs"]}
+# sphinx-click = {version = "^4.3.0", extras = ["docs"]}
+# myst-parser = {version = "*", extras = ["docs"]}
+# jupyter = {version = "*", extras = ["jupyter"]}

-[tool.poetry.extras]
-docs = [
-    "sphinx",
-    "sphinx-rtd-theme",
-    # "sphinx-autodoc-typehints",
-    "sphinx-click",
-    "myst-parser"
-]
-jupyter = [
-    "jupyter"
-]
+# [tool.poetry.extras]
+# docs = [
+# "sphinx",
+# "sphinx-rtd-theme",
+# "sphinx-autodoc-typehints",
+# "sphinx-click",
+# "myst-parser"
+# ]
+# jupyter = [
+# "jupyter"
+# ]

-[tool.poetry.group.dev.dependencies]
-black = "^24.1.1"
-pytest = "^7.1.2"
-mypy = "^1.8.0"
+# [tool.poetry.group.dev.dependencies]
+# black = "^24.1.1"
+# pytest = "^7.1.2"
+# mypy = "^1.8.0"

 [tool.poetry-dynamic-versioning]
 enable = true
@@ -52,7 +53,6 @@ style = "pep440"

 [tool.black]
 line-length = 100
-target-version = ["py38", "py39", "py310"]

 [tool.isort]
 profile = "black"
44 changes: 0 additions & 44 deletions src/linkml_arrays/cli.py

This file was deleted.

24 changes: 11 additions & 13 deletions src/linkml_arrays/dumpers/hdf5_dumper.py
@@ -1,5 +1,6 @@
 """Class for dumping a LinkML model to an HDF5 file."""

+from pathlib import Path
 from typing import Union

 import h5py
@@ -14,15 +15,15 @@ def _iterate_element(
 ):
     """Recursively iterate through the elements of a LinkML model and save them.

-    Writes Pydantic BaseModel objects as groups, slots that implement "linkml:elements"
+    Write Pydantic BaseModel objects as groups, slots with the "array" element
     as datasets, and other slots as attributes.
     """
     # get the type of the element
     element_type = type(element).__name__

     for k, v in vars(element).items():
         found_slot = schemaview.induced_slot(k, element_type)
-        if "linkml:elements" in found_slot.implements:
+        if found_slot.array:
             # save the numpy array to an hdf5 dataset
             group.create_dataset(found_slot.name, data=v)
         else:
@@ -39,16 +40,13 @@ class Hdf5Dumper(Dumper):
     """Dumper class for LinkML models to HDF5 files."""

     # TODO is this the right method to overwrite? it does not dump a string
-    def dumps(self, element: Union[YAMLRoot, BaseModel], schemaview: SchemaView, **kwargs):
-        """Dump the element to an HDF5 file.
-
-        Raises:
-            ValueError: If the class requires an identifier and it is not provided.
-        """
-        id_slot = schemaview.get_identifier_slot(element.__class__.__name__)
-        if id_slot is None:
-            raise ValueError("The class requires an identifier.")
-        id_value = getattr(element, id_slot.name)
-        output_file_path = f"{id_value}.h5"
+    def dumps(
+        self,
+        element: Union[YAMLRoot, BaseModel],
+        schemaview: SchemaView,
+        output_file_path: Union[str, Path],
+        **kwargs,
+    ):
+        """Dump the element to an HDF5 file."""
         with h5py.File(output_file_path, "w") as f:
             _iterate_element(element, schemaview, f)
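With this change, callers pass the output path explicitly instead of having it derived from the element's identifier slot. A usage sketch (the container instance and schema path are assumptions based on this PR's tests, not verified API):

```python
from linkml_runtime import SchemaView

from linkml_arrays.dumpers.hdf5_dumper import Hdf5Dumper

schemaview = SchemaView("tests/input/temperature_schema.yaml")
container = ...  # an instance of a generated pydantic class, e.g. Container(...)

# Each BaseModel becomes an HDF5 group, each array slot a dataset,
# and other slots become attributes.
Hdf5Dumper().dumps(container, schemaview, output_file_path="out/my_container.h5")
```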
123 changes: 123 additions & 0 deletions src/linkml_arrays/dumpers/yaml_array_file_dumper.py
@@ -0,0 +1,123 @@
+"""Base class for dumping a LinkML model to YAML with paths to files containing arrays."""
+
+import os
+from abc import ABCMeta, abstractmethod
+from collections.abc import Callable
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+import yaml
+from linkml_runtime import SchemaView
+from linkml_runtime.dumpers.dumper_root import Dumper
+from linkml_runtime.utils.yamlutils import YAMLRoot
+from pydantic import BaseModel
+
+
+def _iterate_element(
+    element: Union[YAMLRoot, BaseModel],
+    schemaview: SchemaView,
+    output_dir: Path,
+    write_array: Callable,
+    format: str,
+    parent_identifier=None,
+    inlined_name=None,
+):
+    """Recursively iterate through the elements of a LinkML model and save them.
+
+    Return a dictionary with the same structure as the input element, but where the slots
+    with the "array" element are written to an array file and the paths to these
+    files are returned in the dictionary. The paths are relative to the output directory.
+
+    Raises:
+        ValueError: If the class requires an identifier and it is not provided.
+    """
+    # get the type of the element
+    element_type = type(element).__name__
+
+    # ask schemaview whether it has a class by this name
+    found_class = schemaview.get_class(element_type)
+
+    id_slot = schemaview.get_identifier_slot(found_class.name)
+    if id_slot is not None:
+        id_value = getattr(element, id_slot.name)
+    else:
+        id_value = None
+
+    ret_dict = dict()
+    for k, v in vars(element).items():
+        found_slot = schemaview.induced_slot(k, element_type)
+        if found_slot.array:
+            if id_slot is None and parent_identifier is None:
+                raise ValueError("The class requires an identifier.")
+
+            # determine the output file name without the suffix
+            if id_slot is not None:
+                output_file_name = f"{id_value}.{found_slot.name}"
+            elif inlined_name is not None:
+                output_file_name = f"{parent_identifier}.{inlined_name}.{found_slot.name}"
+            elif parent_identifier is not None:
+                output_file_name = f"{parent_identifier}.{found_slot.name}"
+            else:
+                output_file_name = f"{found_slot.name}"
+
+            # if output_dir is absolute, make it relative to current working directory
+            # and create the directory if it does not exist
+            if output_dir.is_absolute():
+                output_dir = Path(os.path.relpath(output_dir, start=os.getcwd()))
+            output_dir.mkdir(exist_ok=True)
+            output_file_path_no_suffix = output_dir / output_file_name
+
+            # save the numpy array to file and write the file path to the dictionary
+            output_file_path = write_array(v, output_file_path_no_suffix)
+            ret_dict[k] = {
+                "source": [
+                    {
+                        "file": f"./{output_file_path}",
+                        "format": format,
+                    }
+                ]
+            }
+        else:
+            if isinstance(v, BaseModel):
+                v2 = _iterate_element(
+                    v,
+                    schemaview,
+                    output_dir,
+                    write_array,
+                    format,
+                    id_value,
+                    inlined_name=found_slot.name,
+                )
+                ret_dict[k] = v2
+            else:
+                ret_dict[k] = v
+    return ret_dict
+
+
+class YamlArrayFileDumper(Dumper, metaclass=ABCMeta):
+    """Base dumper class for LinkML models to YAML files with paths to array files."""
+
+    # FORMAT is a class attribute that must be set by subclasses
+
+    def dumps(
+        self,
+        element: Union[YAMLRoot, BaseModel],
+        schemaview: SchemaView,
+        output_dir: Optional[Union[str, Path]] = None,
+        **kwargs,
+    ) -> str:
+        """Return element formatted as a YAML string."""
+        if output_dir is None:
+            output_dir = "."
+        input = _iterate_element(
+            element, schemaview, Path(output_dir), self.write_array, self.FORMAT
+        )
+
+        return yaml.dump(input)
+
+    @classmethod
+    @abstractmethod
+    def write_array(cls, array: Union[List, np.ndarray], output_file_path: Union[str, Path]):
+        """Write an array to a file."""
+        raise NotImplementedError("Subclasses must implement this method.")
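To make this base class concrete, a subclass only needs to set FORMAT and implement write_array. A minimal sketch of a NumPy-backed subclass follows; the repository's actual subclasses may differ in naming and details:

```python
import numpy as np

from linkml_arrays.dumpers.yaml_array_file_dumper import YamlArrayFileDumper


class YamlNumpyDumper(YamlArrayFileDumper):
    """Sketch: write each array slot to a .npy file next to the YAML."""

    FORMAT = "numpy"  # recorded in the "format" field of each "source" entry

    @classmethod
    def write_array(cls, array, output_file_path_no_suffix):
        # Add the format-specific suffix; the returned path is what ends up
        # in the "file" field of the dumped YAML.
        output_file_path = f"{output_file_path_no_suffix}.npy"
        np.save(output_file_path, np.asarray(array))
        return output_file_path
```

For a slot named values on an element with identifier my_container, dumps() would then return YAML shaped roughly like this (values illustrative):

```yaml
values:
  source:
  - file: ./my_container.values.npy
    format: numpy
```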
11 changes: 6 additions & 5 deletions src/linkml_arrays/dumpers/yaml_dumper.py
@@ -1,4 +1,4 @@
-"""Class for dumpling a LinkML model to a YAML file."""
+"""Class for dumping a LinkML model to YAML."""

 from typing import Union

@@ -14,8 +14,8 @@ def _iterate_element(
 ):
     """Recursively iterate through the elements of a LinkML model and save them.

-    Returns a dictionary with the same structure as the input element, but with the slots
-    that implement "linkml:elements" (arrays) are written as lists or lists of lists.
+    Returns a dictionary with the same structure as the input element, but where the slots
+    with the "array" element are written as lists of lists in YAML.

     Raises:
         ValueError: If the class requires an identifier and it is not provided.
@@ -35,10 +35,11 @@
     ret_dict = dict()
     for k, v in vars(element).items():
         found_slot = schemaview.induced_slot(k, element_type)
-        if "linkml:elements" in found_slot.implements:
+        if found_slot.array:
             if id_slot is None and parent_identifier is None:
                 raise ValueError("The class requires an identifier.")
-            ret_dict[k] = v.tolist()
+            assert isinstance(v, list)
+            ret_dict[k] = v
         else:
             if isinstance(v, BaseModel):
                 v2 = _iterate_element(v, schemaview, id_value)
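Since arrays now arrive as plain lists of lists from the LoL pydantic classes, the dumper emits them directly instead of calling .tolist() on a NumPy array. A hypothetical 2x2 array slot would serialize like this under yaml.dump's default block style:

```yaml
values:
- - 0.0
  - 0.5
- - 1.0
  - 1.5
```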