Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ancestor enrichment #37

Merged
merged 13 commits into from
Sep 28, 2023
120 changes: 91 additions & 29 deletions pandasaurus/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandasaurus.utils.pandasaurus_exceptions import InvalidTerm, ObsoletedTerm
from pandasaurus.utils.query_utils import chunks, run_sparql_query
from pandasaurus.utils.sparql_queries import (
get_ancestor_enrichment_query,
get_contextual_enrichment_query,
get_full_enrichment_query,
get_most_specific_objects_query,
Expand Down Expand Up @@ -47,15 +48,15 @@ def __init__(

"""
# Might be unnecessary
self.__seed_list = seed_list
self.__enrichment_property_list = enrichment_property_list if enrichment_property_list else ["rdfs:subClassOf"]
self.__term_list: List[Term] = CurieValidator.construct_term_list(seed_list)
self._seed_list = seed_list
self._enrichment_property_list = enrichment_property_list if enrichment_property_list else ["rdfs:subClassOf"]
self._term_list: List[Term] = CurieValidator.construct_term_list(seed_list)
self.enriched_df = pd.DataFrame()
self.graph_df = pd.DataFrame()
self.graph = Graph()
# Validation and reporting
try:
CurieValidator.get_validation_report(self.__term_list)
CurieValidator.get_validation_report(self._term_list)
except InvalidTerm as e:
print(e.message)
if force_fail:
Expand All @@ -76,9 +77,9 @@ def simple_enrichment(self) -> pd.DataFrame:
Enriched DataFrame

"""
source_list = [term.get_iri() for term in self.__term_list]
object_list = [term.get_iri() for term in self.__term_list]
query_string = get_simple_enrichment_query(source_list, object_list, self.__enrichment_property_list)
source_list = [term.get_iri() for term in self._term_list]
object_list = [term.get_iri() for term in self._term_list]
query_string = get_simple_enrichment_query(source_list, object_list, self._enrichment_property_list)
self.enriched_df = (
pd.DataFrame(
[res for res in run_sparql_query(query_string)],
Expand All @@ -87,9 +88,7 @@ def simple_enrichment(self) -> pd.DataFrame:
.sort_values("s")
.reset_index(drop=True)
)
self.mirror_enrichment_for_graph_generation(object_list)
self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.graph_df["p"].unique().tolist())
self._generate_enrichment_graph(object_list)

return self.enriched_df

Expand All @@ -105,15 +104,15 @@ def minimal_slim_enrichment(self, slim_list: List[str]) -> pd.DataFrame:
Enriched DataFrame

"""
source_list = [term.get_iri() for term in self.__term_list]
source_list = [term.get_iri() for term in self._term_list]
object_list = source_list + SlimManager.get_slim_members(slim_list)
s_result = []
for chunk in chunks(object_list, 90):
s_result.extend(
[
res
for res in run_sparql_query(
get_simple_enrichment_query(source_list, chunk, self.__enrichment_property_list)
get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list)
)
]
)
Expand All @@ -122,9 +121,7 @@ def minimal_slim_enrichment(self, slim_list: List[str]) -> pd.DataFrame:
.sort_values("s")
.reset_index(drop=True)
)
self.mirror_enrichment_for_graph_generation(object_list)
self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())
self._generate_enrichment_graph(object_list)

return self.enriched_df

Expand All @@ -141,7 +138,7 @@ def full_slim_enrichment(self, slim_list: List[str]) -> pd.DataFrame:
Enriched DataFrame

"""
source_list = [term.get_iri() for term in self.__term_list]
source_list = [term.get_iri() for term in self._term_list]
object_list = source_list + SlimManager.get_slim_members(slim_list)
s_result = []
for chunk in chunks(object_list, 90):
Expand All @@ -154,9 +151,7 @@ def full_slim_enrichment(self, slim_list: List[str]) -> pd.DataFrame:
.sort_values("s")
.reset_index(drop=True)
)
self.mirror_enrichment_for_graph_generation(object_list)
self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())
self._generate_enrichment_graph(object_list)

return self.enriched_df

Expand All @@ -175,15 +170,15 @@ def contextual_slim_enrichment(self, context: List[str]) -> pd.DataFrame:
"""
# TODO add a curie checking mechanism for context list
query_string = get_contextual_enrichment_query(context)
source_list = [term.get_iri() for term in self.__term_list]
source_list = [term.get_iri() for term in self._term_list]
object_list = source_list + [res.get("term") for res in run_sparql_query(query_string)]
s_result = []
for chunk in chunks(object_list, 90):
s_result.extend(
[
res
for res in run_sparql_query(
get_simple_enrichment_query(source_list, chunk, self.__enrichment_property_list)
get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list)
)
]
)
Expand All @@ -193,12 +188,73 @@ def contextual_slim_enrichment(self, context: List[str]) -> pd.DataFrame:
.sort_values("s")
.reset_index(drop=True)
)
self.mirror_enrichment_for_graph_generation(object_list)
self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())
self._generate_enrichment_graph(object_list)

return self.enriched_df

def ancestor_enrichment(self, step_count: int) -> pd.DataFrame:
    """
    Perform ancestor enrichment analysis with a specified number of hops.

    Args:
        step_count (int): The number of hops to consider when enriching terms.
            Must be an int: it is used to build the hop-count of the SPARQL
            query (and `parent_enrichment` passes the literal ``1``).

    Returns:
        pd.DataFrame: A DataFrame containing enriched terms and associated information.

    This method conducts an ancestor enrichment analysis on a set of seed terms,
    considering the specified number of hops in the ontology graph. The analysis
    retrieves terms that are ancestors of the seed terms within the specified
    number of hops and compiles the results into a DataFrame.

    The `step_count` parameter controls the depth of the analysis. A smaller
    `step_count` limits the analysis to immediate ancestors, while a larger value
    includes more distant ancestors.

    """
    source_list = [term.get_iri() for term in self._term_list]
    query_string = get_ancestor_enrichment_query(source_list, step_count)
    # Flatten every binding (?s, ?o0, ?o1, ...) returned by the ancestor
    # query into a deduplicated list of IRIs.
    object_list = list({uri for res in run_sparql_query(query_string) for uri in res.values()})
    s_result = []
    # Query in chunks of 90 terms to keep each SPARQL VALUES clause small.
    for chunk in chunks(object_list, 90):
        s_result.extend(
            run_sparql_query(
                get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list)
            )
        )

    self.enriched_df = (
        pd.DataFrame(s_result, columns=["s", "s_label", "p", "o", "o_label"])
        .sort_values("s")
        .reset_index(drop=True)
    )
    self._generate_enrichment_graph(object_list)

    return self.enriched_df

def parent_enrichment(self) -> pd.DataFrame:
    """
    Perform parent enrichment analysis.

    This method is a convenience wrapper around the `ancestor_enrichment` method,
    specifically designed to perform parent enrichment analysis. Parent enrichment
    analysis considers only immediate parent terms of the seed terms in the ontology
    graph (i.e., one-hop ancestors).

    Returns:
        pd.DataFrame: A DataFrame containing enriched parent terms and associated
            information.

    This method simplifies the process of conducting parent enrichment analysis by
    calling the `ancestor_enrichment` method with a `step_count` of 1, which limits
    the analysis to immediate parent terms of the seed terms.

    """
    # Bug fix: the result was previously computed but never returned,
    # so callers received None despite the documented DataFrame return.
    return self.ancestor_enrichment(1)

def synonym_lookup(self) -> pd.DataFrame:
"""

Expand All @@ -207,7 +263,7 @@ def synonym_lookup(self) -> pd.DataFrame:

"""
label_df = pd.DataFrame(
{term.get_iri(): term.get_label() for term in self.__term_list}.items(), columns=["ID", "label"]
{term.get_iri(): term.get_label() for term in self._term_list}.items(), columns=["ID", "label"]
)

synonym_query_results = run_sparql_query(get_synonym_query(label_df["ID"].tolist()))
Expand Down Expand Up @@ -242,7 +298,7 @@ def get_most_specific_objects(self, predicate: str, ontology: str):
Returns:

"""
subject_list = [term.get_iri() for term in self.__term_list]
subject_list = [term.get_iri() for term in self._term_list]
query_string = get_most_specific_objects_query(subject_list, predicate, ontology)
return (
pd.DataFrame(
Expand All @@ -268,7 +324,7 @@ def get_most_specific_subjects(self, predicate: str, ontology: str):
Returns:

"""
object_list = [term.get_iri() for term in self.__term_list]
object_list = [term.get_iri() for term in self._term_list]
query_string = get_most_specific_subjects_query(object_list, predicate, ontology)
return (
pd.DataFrame(
Expand Down Expand Up @@ -297,7 +353,7 @@ def query(self, column_name: str, query_term: str) -> pd.DataFrame:

def update_obsoleted_terms(self):
"""Replaces all obsoleted terms in the term list with the new term that obsoletes them."""
[getattr(term, "update_obsoleted_term")() for term in self.__term_list]
[getattr(term, "update_obsoleted_term")() for term in self._term_list]

def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
# TODO definitely need a refactoring later on
Expand All @@ -308,7 +364,7 @@ def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
[
res
for res in run_sparql_query(
get_simple_enrichment_query(s_chunk, o_chunk, self.__enrichment_property_list)
get_simple_enrichment_query(s_chunk, o_chunk, self._enrichment_property_list)
)
]
)
Expand All @@ -317,3 +373,9 @@ def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
.sort_values("s")
.reset_index(drop=True)
)

def _generate_enrichment_graph(self, object_list):
    """Populate ``self.graph`` from the current enrichment results.

    Runs the mirror enrichment query over ``object_list`` to fill
    ``self.graph_df``, builds the enrichment graph from it, then applies a
    transitive reduction over the predicates present in ``self.enriched_df``.

    Args:
        object_list: Term IRIs to mirror-enrich for graph generation.
    """
    self.mirror_enrichment_for_graph_generation(object_list)
    self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
    # NOTE(review): the reduction takes its predicate list from enriched_df;
    # a previously commented-out variant used graph_df["p"] instead — confirm
    # which predicate set is intended before removing this note.
    self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())
22 changes: 22 additions & 0 deletions pandasaurus/utils/sparql_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,28 @@ def get_label_query(term_iri_list: List[str]) -> str:
)


def get_ancestor_enrichment_query(seed_list: List[str], step_count) -> str:
    """Build a SPARQL query walking ``step_count`` subClassOf hops above the seeds.

    Args:
        seed_list: Seed term CURIEs/IRIs injected into the VALUES clause.
        step_count: Number of subClassOf hops to follow; every hop past the
            first is wrapped in an OPTIONAL block so shorter paths still match.

    Returns:
        The assembled SPARQL query string.
    """
    cl_owl = "<http://purl.obolibrary.org/obo/cl.owl>"

    # Optional hop patterns and matching isDefinedBy constraints for hops 1..n-1.
    optional_hops = []
    defined_by_parts = [
        f"GRAPH <http://reasoner.renci.org/ontology> {{ ?o0 rdfs:isDefinedBy {cl_owl}. "
    ]
    for hop in range(step_count - 1):
        optional_hops.append(f"OPTIONAL {{ ?o{hop} rdfs:subClassOf ?o{hop + 1}.}} ")
        defined_by_parts.append(f"?o{hop + 1} rdfs:isDefinedBy {cl_owl}. ")

    return (
        "SELECT * WHERE { GRAPH <http://reasoner.renci.org/nonredundant> "
        f"{{ VALUES ?s {{{' '.join(seed_list)} }} "
        "?s rdfs:subClassOf ?o0. "
        + "".join(optional_hops)
        + "} "
        + "".join(defined_by_parts)
        + "}} #LIMIT"
    )


def get_synonym_query(term_iri_list: List[str]) -> str:
return (
f"SELECT * WHERE {{VALUES ?s {{ {' '.join(term_iri_list)} }}"
Expand Down
Loading
Loading