From 87c34b9890ac51bb9ad2beaf08986234981964a0 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Fri, 12 Jul 2024 16:11:21 -0700 Subject: [PATCH 1/2] initial koza split command seems to be working --- src/koza/cli_utils.py | 59 +++++++++++++++++++++++++++++++++++++++++++ src/koza/main.py | 10 +++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/koza/cli_utils.py b/src/koza/cli_utils.py index edc1393..01d2f4b 100644 --- a/src/koza/cli_utils.py +++ b/src/koza/cli_utils.py @@ -3,6 +3,7 @@ """ from pathlib import Path +import os from typing import Dict, Literal, Optional, Union import yaml @@ -126,6 +127,64 @@ def _check_row_count(type: Literal["node", "edge"]): _check_row_count("edge") +def split_file(file: str, + fields: str, + format: OutputFormat = OutputFormat.tsv, + output_dir: str = "./output"): + db = duckdb.connect(":memory:") + + #todo: validate that each of the fields is actually a column in the file + if format == OutputFormat.tsv: + read_file = f"read_csv('{file}')" + elif format == OutputFormat.json: + read_file = f"read_json('{file}')" + else: + raise ValueError(f"Format {format} not supported") + + values = db.execute(f'SELECT DISTINCT {fields} FROM {read_file};').fetchall() + keys = fields.split(',') + list_of_value_dicts = [dict(zip(keys, v)) for v in values] + + def clean_value_for_filename(value): + return value.replace("biolink:", "").replace(" ", "_").replace(":", "_") + + def generate_filename_from_row(row): + return "_".join([clean_value_for_filename(row[k]) for k in keys if row[k] is not None]) + + def get_filename_prefix(name): + # get just the filename part of the path + name = os.path.basename(name) + if name.endswith('_edges.tsv'): + return name[:-9] + elif name.endswith('_nodes.tsv'): + return name[:-9] + else: + raise ValueError(f"Unexpected file name {name}, not sure how to make am output prefix for it") + + def get_filename_suffix(name): + if name.endswith('_edges.tsv'): + return '_edges.tsv' + elif name.endswith('_nodes.tsv'): + return '_nodes.tsv' + else: + raise ValueError(f"Unexpected file name {name}, not sure how to make am output prefix for it") + + # create output/split if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + for row in list_of_value_dicts: + # export to a tsv file named with the values of the pivot fields + where_clause = ' AND '.join([f"{k} = '{row[k]}'" for k in keys]) + file_name = output_dir + "/" + get_filename_prefix(file) + generate_filename_from_row(row) + get_filename_suffix(file) + print(f"writing {file_name}") + db.execute(f""" + COPY ( + SELECT * FROM {read_file} + WHERE {where_clause} + ) TO '{file_name}' (HEADER, DELIMITER '\t'); + """) + + def validate_file( file: str, format: FormatType = FormatType.csv, diff --git a/src/koza/main.py b/src/koza/main.py index fe4008b..9997801 100755 --- a/src/koza/main.py +++ b/src/koza/main.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Optional -from koza.cli_utils import transform_source, validate_file +from koza.cli_utils import transform_source, validate_file, split_file from koza.model.config.source_config import FormatType, OutputFormat import typer @@ -65,6 +65,14 @@ def validate( """Validate a source file""" validate_file(file, format, delimiter, header_delimiter, skip_blank_lines) +@typer_app.command() +def split( + file: str = typer.Argument(..., help="Path to the source kgx file to be split"), + fields: str = typer.Argument(..., help="Comma separated list of fields to split on"), + output_dir: str = typer.Option(default="output", help="Path to output directory"), +): + """Split a file by fields""" + split_file(file, fields, output_dir=output_dir) if __name__ == "__main__": typer_app() From 8760f5162093edecf2fdd44b83e4051b5440131b Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Thu, 3 Oct 2024 13:13:16 -0700 Subject: [PATCH 2/2] Add option to split command that removes prefixes in filenames generated from field values --- src/koza/cli_utils.py | 4 ++++ src/koza/main.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/koza/cli_utils.py b/src/koza/cli_utils.py index 01d2f4b..e62b7e9 100644 --- a/src/koza/cli_utils.py +++ b/src/koza/cli_utils.py @@ -130,6 +130,7 @@ def _check_row_count(type: Literal["node", "edge"]): def split_file(file: str, fields: str, format: OutputFormat = OutputFormat.tsv, + remove_prefixes: bool = False, output_dir: str = "./output"): db = duckdb.connect(":memory:") @@ -146,6 +147,9 @@ def split_file(file: str, list_of_value_dicts = [dict(zip(keys, v)) for v in values] def clean_value_for_filename(value): + if remove_prefixes and ':' in value: + value = value.split(":")[-1] + return value.replace("biolink:", "").replace(" ", "_").replace(":", "_") def generate_filename_from_row(row): diff --git a/src/koza/main.py b/src/koza/main.py index 9997801..fa4a27b 100755 --- a/src/koza/main.py +++ b/src/koza/main.py @@ -69,10 +69,11 @@ def validate( def split( file: str = typer.Argument(..., help="Path to the source kgx file to be split"), fields: str = typer.Argument(..., help="Comma separated list of fields to split on"), + remove_prefixes: bool = typer.Option(False, help="Remove prefixes from the file names for values from the specified fields. (e.g, NCBIGene:9606 becomes 9606"), output_dir: str = typer.Option(default="output", help="Path to output directory"), ): """Split a file by fields""" - split_file(file, fields, output_dir=output_dir) + split_file(file, fields,remove_prefixes=remove_prefixes, output_dir=output_dir) if __name__ == "__main__": typer_app()