From 87c34b9890ac51bb9ad2beaf08986234981964a0 Mon Sep 17 00:00:00 2001
From: Kevin Schaper <kevinschaper@gmail.com>
Date: Fri, 12 Jul 2024 16:11:21 -0700
Subject: [PATCH 1/2] initial koza split command seems to be working

---
 src/koza/cli_utils.py | 59 +++++++++++++++++++++++++++++++++++++++++++
 src/koza/main.py      | 10 +++++++-
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/koza/cli_utils.py b/src/koza/cli_utils.py
index edc1393..01d2f4b 100644
--- a/src/koza/cli_utils.py
+++ b/src/koza/cli_utils.py
@@ -3,6 +3,7 @@
 """
 
 from pathlib import Path
+import os
 from typing import Dict, Literal, Optional, Union
 import yaml
 
@@ -126,6 +127,64 @@ def _check_row_count(type: Literal["node", "edge"]):
         _check_row_count("edge")
 
 
+def split_file(file: str,
+               fields: str,
+               format: OutputFormat = OutputFormat.tsv,
+               output_dir: str = "./output"):
+    db = duckdb.connect(":memory:")
+
+    # Write one TSV per distinct combination of values in the comma separated `fields` columns of `file`.
+    # TODO: validate that each of the fields is actually a column in the file
+    if format == OutputFormat.tsv:
+        read_file = f"read_csv('{file}')"
+    elif format == OutputFormat.json:
+        read_file = f"read_json('{file}')"
+    else:
+        raise ValueError(f"Format {format} not supported")
+    values = db.execute(f'SELECT DISTINCT {fields} FROM {read_file};').fetchall()
+    keys = [k.strip() for k in fields.split(',') if k.strip()]
+    list_of_value_dicts = [dict(zip(keys, v)) for v in values]
+
+    def clean_value_for_filename(value):
+        return value.replace("biolink:", "").replace(" ", "_").replace(":", "_")
+
+    def generate_filename_from_row(row):
+        return "_".join([clean_value_for_filename(str(row[k])) for k in keys if row[k] is not None])
+
+    def get_filename_prefix(name):
+        # get just the filename part of the path
+        name = os.path.basename(name)
+        if name.endswith(('_edges.tsv', '_nodes.tsv')):
+            return name[:-9]  # drop "edges.tsv"/"nodes.tsv", keep the "_" separator
+        raise ValueError(f"Unexpected file name {name}, not sure how to make an output prefix for it")
+
+    def get_filename_suffix(name):
+        for suffix in ('_edges.tsv', '_nodes.tsv'):
+            if name.endswith(suffix):
+                return suffix
+        raise ValueError(f"Unexpected file name {name}, not sure how to make an output suffix for it")
+
+    def sql_literal(value):
+        # double embedded single quotes so the value is safe inside a '...' SQL literal
+        return "'" + str(value).replace("'", "''") + "'"
+
+    # create the output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    for row in list_of_value_dicts:
+        # export to a tsv file named with the values of the pivot fields
+        # NULL never compares equal in SQL, so missing values are matched with IS NULL
+        where_clause = ' AND '.join([f"{k} IS NULL" if row[k] is None else f"{k} = {sql_literal(row[k])}" for k in keys])
+        file_name = output_dir + "/" + get_filename_prefix(file) +  generate_filename_from_row(row) + get_filename_suffix(file)
+        print(f"writing {file_name}")
+        db.execute(f"""
+        COPY (
+            SELECT * FROM {read_file}
+            WHERE {where_clause}
+        ) TO {sql_literal(file_name)} (HEADER, DELIMITER '\t');
+        """)
+
+
 def validate_file(
     file: str,
     format: FormatType = FormatType.csv,
diff --git a/src/koza/main.py b/src/koza/main.py
index fe4008b..9997801 100755
--- a/src/koza/main.py
+++ b/src/koza/main.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import Optional
 
-from koza.cli_utils import transform_source, validate_file
+from koza.cli_utils import transform_source, validate_file, split_file
 from koza.model.config.source_config import FormatType, OutputFormat
 
 import typer
@@ -65,6 +65,14 @@ def validate(
     """Validate a source file"""
     validate_file(file, format, delimiter, header_delimiter, skip_blank_lines)
 
+@typer_app.command()  # Typer registers this function as the `split` subcommand
+def split(
+    file: str = typer.Argument(..., help="Path to the source kgx file to be split"),
+    fields: str = typer.Argument(..., help="Comma separated list of fields to split on"),
+    output_dir: str = typer.Option(default="output", help="Path to output directory"),
+):
+    """Split a file by fields"""
+    split_file(file, fields, output_dir=output_dir)
 
 if __name__ == "__main__":
     typer_app()

From 8760f5162093edecf2fdd44b83e4051b5440131b Mon Sep 17 00:00:00 2001
From: Kevin Schaper <kevinschaper@gmail.com>
Date: Thu, 3 Oct 2024 13:13:16 -0700
Subject: [PATCH 2/2] Add option to split command that removes prefixes in
 filenames generated from field values

---
 src/koza/cli_utils.py | 4 ++++
 src/koza/main.py      | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/koza/cli_utils.py b/src/koza/cli_utils.py
index 01d2f4b..e62b7e9 100644
--- a/src/koza/cli_utils.py
+++ b/src/koza/cli_utils.py
@@ -130,6 +130,7 @@ def _check_row_count(type: Literal["node", "edge"]):
 def split_file(file: str,
                fields: str,
                format: OutputFormat = OutputFormat.tsv,
+               remove_prefixes: bool = False,
                output_dir: str = "./output"):
     db = duckdb.connect(":memory:")
 
@@ -146,6 +147,9 @@ def split_file(file: str,
     list_of_value_dicts = [dict(zip(keys, v)) for v in values]
 
     def clean_value_for_filename(value):
+        if remove_prefixes and ':' in value:
+            value = value.split(":")[-1]  # keep only the text after the last ':' (e.g. NCBIGene:9606 -> 9606)
+
         return value.replace("biolink:", "").replace(" ", "_").replace(":", "_")
 
     def generate_filename_from_row(row):
diff --git a/src/koza/main.py b/src/koza/main.py
index 9997801..fa4a27b 100755
--- a/src/koza/main.py
+++ b/src/koza/main.py
@@ -69,10 +69,11 @@ def validate(
 def split(
     file: str = typer.Argument(..., help="Path to the source kgx file to be split"),
     fields: str = typer.Argument(..., help="Comma separated list of fields to split on"),
+    remove_prefixes: bool = typer.Option(False, help="Remove prefixes from the file names for values from the specified fields (e.g., NCBIGene:9606 becomes 9606)."),
     output_dir: str = typer.Option(default="output", help="Path to output directory"),
 ):
     """Split a file by fields"""
-    split_file(file, fields, output_dir=output_dir)
+    split_file(file, fields, remove_prefixes=remove_prefixes, output_dir=output_dir)
 
 if __name__ == "__main__":
     typer_app()