Skip to content

Commit

Permalink
Get MSDF from LinkML objects (#440)
Browse files Browse the repository at this point in the history
Closes #439

This PR adds three methods to the `MappingSetDataFrame` class that allow
instantiation from LinkML objects such as `MappingSetDocument`,
`MappingSet`, or a list of `Mapping`.

It has the benefit of pre-baking in the logic for handling default
metadata and converters, so that nobody has to re-implement it themselves
downstream.

This is going to help us address
https://github.com/INCATools/ontology-access-kit/blob/75940bfa883001afb0e4aebb339fd62583c13844/src/oaklib/utilities/lexical/lexical_indexer.py#L313-L325,
specifically in
INCATools/ontology-access-kit#664
  • Loading branch information
cthoyt authored Oct 5, 2023
1 parent 678ad51 commit 12c8b1a
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 27 deletions.
1 change: 1 addition & 0 deletions src/sssom/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from sssom_schema import Mapping, MappingSet, slots # noqa:401

from sssom.sssom_document import MappingSetDocument # noqa:401
from sssom.util import ( # noqa:401
MappingSetDataFrame,
collapse,
Expand Down
110 changes: 84 additions & 26 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging as _logging
import os
import re
from collections import defaultdict
from collections import ChainMap, defaultdict
from dataclasses import dataclass, field
from functools import lru_cache, partial, reduce
from pathlib import Path
Expand All @@ -21,7 +21,7 @@
from jsonschema import ValidationError
from linkml_runtime.linkml_model.types import Uriorcurie
from sssom_schema import Mapping as SSSOM_Mapping
from sssom_schema import slots
from sssom_schema import MappingSet, slots

from .constants import (
COLUMN_INVERT_DICTIONARY,
Expand Down Expand Up @@ -59,7 +59,13 @@
UNKNOWN_IRI,
SSSOMSchemaView,
)
from .context import SSSOM_BUILT_IN_PREFIXES, _get_built_in_prefix_map, get_converter
from .context import (
HINT,
SSSOM_BUILT_IN_PREFIXES,
_get_built_in_prefix_map,
ensure_converter,
get_converter,
)
from .sssom_document import MappingSetDocument
from .typehints import MetadataType, PrefixMap, get_default_metadata

Expand Down Expand Up @@ -105,6 +111,79 @@ def with_converter(
metadata=metadata or get_default_metadata(),
)

@classmethod
def from_mappings(
    cls,
    mappings: List[SSSOM_Mapping],
    *,
    converter: HINT = None,
    metadata: Optional[MetadataType] = None,
) -> "MappingSetDataFrame":
    """Build a mapping set dataframe from a list of mappings.

    :param mappings: The mappings to wrap in a new mapping set
    :param converter: Passed on unchanged to :meth:`from_mapping_set`
    :param metadata: Optional mapping set metadata. Its entries take priority over
        the default metadata (which includes a dummy license and mapping set URI).
    :returns: A mapping set dataframe
    """
    explicit_metadata = metadata if metadata else {}
    # Lookups hit the explicitly given metadata first and only fall back
    # to the default metadata for keys that were not given.
    combined_metadata = ChainMap(explicit_metadata, get_default_metadata())
    return cls.from_mapping_set(
        mapping_set=MappingSet(mappings=mappings, **combined_metadata),
        converter=converter,
    )

@classmethod
def from_mapping_set(
    cls, mapping_set: MappingSet, *, converter: HINT = None
) -> "MappingSetDataFrame":
    """Build a mapping set dataframe from a mapping set.

    :param mapping_set: A mapping set
    :param converter: A prefix map or pre-instantiated converter. If none given, uses a default
        prefix map derived from the Bioregistry.
    :returns: A mapping set dataframe
    """
    resolved_converter = ensure_converter(converter)
    # Bundle the mapping set with its converter, then reuse the document-based constructor.
    document = MappingSetDocument(converter=resolved_converter, mapping_set=mapping_set)
    return cls.from_mapping_set_document(document)

@classmethod
def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFrame":
    """Instantiate from a mapping set document.

    :param doc: A mapping set document, providing the mapping set and the converter
    :returns: A mapping set dataframe built from the document's mappings, converter,
        and global metadata
    """
    # Extract the metadata before looking at the mappings so that it is
    # preserved even when the mapping set has no mappings.
    meta = extract_global_metadata(doc)
    # Drop the prefix map entry; the converter is passed on separately below.
    meta.pop(PREFIX_MAP_KEY, None)

    if doc.mapping_set.mappings is None:
        return cls.with_converter(df=pd.DataFrame(), converter=doc.converter, metadata=meta)

    df = pd.DataFrame(get_dict_from_mapping(mapping) for mapping in doc.mapping_set.mappings)

    # Remove columns where all values are blank.
    df.replace("", np.nan, inplace=True)
    df.dropna(axis=1, how="all", inplace=True)

    slots_with_double_as_range = {
        slot
        for slot, slot_metadata in _get_sssom_schema_object().dict["slots"].items()
        if slot_metadata["range"] == "double"
    }
    # Only columns whose slot range is "double" keep NaN for missing values;
    # every other column goes back to using the empty string.
    non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
    non_double_cols = non_double_cols.replace(np.nan, "")
    df[non_double_cols.columns] = non_double_cols

    df = sort_df_rows_columns(df)
    return cls.with_converter(df=df, converter=doc.converter, metadata=meta)

def to_mapping_set_document(self) -> "MappingSetDocument":
    """Convert this mapping set dataframe into a mapping set document."""
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import with the parsers module - confirm.
    from .parsers import to_mapping_set_document as _build_document

    return _build_document(self)

def to_mapping_set(self) -> MappingSet:
    """Convert to a mapping set document and return the mapping set it holds."""
    document = self.to_mapping_set_document()
    return document.mapping_set

def to_mappings(self) -> List[SSSOM_Mapping]:
    """Get the list of mappings.

    :returns: The ``mappings`` attribute of the mapping set obtained
        from :meth:`to_mapping_set`
    """
    return self.to_mapping_set().mappings

def clean_context(self) -> None:
"""Clean up the context."""
self.converter = curies.chain([_get_built_in_prefix_map(), self.converter])
Expand Down Expand Up @@ -948,6 +1027,7 @@ def extract_global_metadata(msdoc: MappingSetDocument) -> Dict[str, PrefixMap]:
:param msdoc: MappingSetDocument object
:return: Dictionary containing metadata
"""
# TODO mark as private
meta = {PREFIX_MAP_KEY: msdoc.prefix_map}
ms_meta = msdoc.mapping_set
for key in [
Expand All @@ -969,29 +1049,7 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame:
:param doc: MappingSetDocument object
:return: MappingSetDataFrame object
"""
data = []
slots_with_double_as_range = [
s
for s in _get_sssom_schema_object().dict["slots"].keys()
if _get_sssom_schema_object().dict["slots"][s]["range"] == "double"
]
if doc.mapping_set.mappings is not None:
for mapping in doc.mapping_set.mappings:
m = get_dict_from_mapping(mapping)
data.append(m)
df = pd.DataFrame(data=data)
meta = extract_global_metadata(doc)
meta.pop(PREFIX_MAP_KEY, None)
# The following 3 lines are to remove columns
# where all values are blank.
df.replace("", np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True) # remove columns with all row = 'None'-s.
non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
non_double_cols = non_double_cols.replace(np.nan, "")
df[non_double_cols.columns] = non_double_cols
msdf = MappingSetDataFrame.with_converter(df=df, converter=doc.converter, metadata=meta)
msdf.df = sort_df_rows_columns(msdf.df)
return msdf
return MappingSetDataFrame.from_mapping_set_document(doc)


def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) -> dict:
Expand Down
43 changes: 42 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from curies import Converter, Record

from sssom.constants import OBJECT_ID, SUBJECT_ID
from sssom.context import SSSOM_BUILT_IN_PREFIXES
from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter
from sssom.io import extract_iri
from sssom.parsers import parse_sssom_table
from sssom.util import (
Expand Down Expand Up @@ -302,3 +302,44 @@ def test_standardize_metadata_raise_on_missing(self):
with self.assertLogs("sssom.util") as cm:
msdf._standardize_metadata_references()
self.assertIn("invalid metadata key xxxx", "".join(cm.output))

def test_msdf_from_mappings(self):
    """Test round tripping to SSSOM classes."""
    rows = [
        (
            "DOID:0050601",
            "ADULT syndrome",
            "skos:exactMatch",
            "UMLS:C1863204",
            "ADULT SYNDROME",
            "semapv:ManualMappingCuration",
            "orcid:0000-0003-4423-4370",
        )
    ]
    columns = [
        "subject_id",
        "subject_label",
        "predicate_id",
        "object_id",
        "object_label",
        "mapping_justification",
        "creator_id",
    ]
    df = pd.DataFrame(rows, columns=columns)
    msdf = MappingSetDataFrame(df=df, converter=ensure_converter())
    msdf.clean_prefix_map(strict=True)

    msd = msdf.to_mapping_set_document()

    new_msdf = MappingSetDataFrame.from_mappings(
        mappings=msd.mapping_set.mappings,
        converter=msd.converter,
        metadata={
            "license": msdf.metadata["license"],
            "mapping_set_id": msdf.metadata["mapping_set_id"],
        },
    )

    self.assertEqual(1, len(new_msdf.df.index))
    # The input dataframe must be left untouched by the round trip.
    self.assertEqual(rows[0], tuple(msdf.df.iloc[0]))
    # The round-tripped dataframe must carry the same values. Compare by
    # column name, since from_mappings() sorts the columns and the
    # positional order is therefore not guaranteed to match the input.
    new_row = new_msdf.df.iloc[0]
    for column, expected in zip(columns, rows[0]):
        self.assertEqual(expected, new_row[column])
    self.assertEqual(new_msdf.metadata, msdf.metadata)

0 comments on commit 12c8b1a

Please sign in to comment.