Skip to content

Commit

Permalink
Support AID columns in Python wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
edongashi committed Jul 31, 2023
1 parent 61cbba7 commit a604b2c
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

### Version 1.1.0

- Added `syndiffix.py` python wrapper for ML feature selection.
- Lowered default thresholds for range and singularity nodes to 15 and 5.
- Added Python wrapper for auto-detecting column types and main features for ML.
- Lowered default thresholds for range and singularity nodes and raised default tree depth limit.
- Improved clustering algorithm for main column.
- Added `--output` (`-o`) CLI argument to directly save the CSV file to disk.
- Added `--clustering-mainfeatures <features>` CLI argument to specify main column's ML features.
Expand Down
20 changes: 19 additions & 1 deletion syndiffix.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,21 @@ def columns_metadata(df):
return columns


def process_aid_columns(arg):
    """Normalize the AID columns argument into a list of column names.

    Parameters:
        arg: A list or tuple of column names, a single column name given
            as a string, or any other value (for example None).

    Returns:
        A new list. Lists and tuples are copied item by item in order, a
        string becomes a one-element list, and any other type yields an
        empty list.
    """
    if isinstance(arg, (list, tuple)):
        # Always hand back a fresh list, never the caller's own object.
        # main() declares a shared mutable default (`aid_columns = []`) and
        # passes it straight in; returning it by reference would let any
        # later in-place change leak into subsequent calls.
        return list(arg)
    if isinstance(arg, str):
        # A lone column name; wrap it instead of iterating its characters.
        return [arg]
    # Any other type is treated as "no AID columns specified".
    return []


def main(
input_path: str,
output_path: str,
aid_columns: list[str] = [],
ml_target: str = None,
ml_features_only: bool = False,
syndiffix_args: str = '',
Expand All @@ -333,6 +345,7 @@ def main(
Parameters:
input_path: Path of input CSV file.
output_path: Path of output CSV file.
aid_columns: Entity identifier columns. If not specified, assumes one row per entity.
ml_target: If specified, focuses on this column for better ML prediction.
ml_features_only: If set, limits columns to only ML features of ml_target.
syndiffix_args: Extra arguments to pass to syndiffix.
Expand All @@ -347,14 +360,19 @@ def main(

extra_args = []

aid_columns = process_aid_columns(aid_columns)
if len(aid_columns) > 0:
print(f'AID Columns: {aid_columns}')
extra_args += ['--aidcolumns', *aid_columns]

if ml_target:
print('ML Target: ' + ml_target)

print('Selecting ML features...')
features = select_features_ml(df, ml_target)['kFeatures']
print('ML Features: ' + (', '.join(features)))

extra_args = [
extra_args += [
'--clustering-maincolumn', ml_target,
'--clustering-mainfeatures', *features
]
Expand Down

0 comments on commit a604b2c

Please sign in to comment.