From 506ae00065dd658b03e93496660d6256a35edab5 Mon Sep 17 00:00:00 2001 From: flywire Date: Wed, 23 Jun 2021 21:05:18 +1000 Subject: [PATCH 01/16] Add parsing options * clean_prestrip - Remove key column transactions using value * format_currency_number - Convert columns to numeric * clean_unwrap - Key column and value column to unwrap * clean_case - Change to Title Case * support for regex * clean_date customised for cba dd mmm format. * logging added --- pdf_statement_reader/parse.py | 103 +++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index 5a5ac10..375a945 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -2,6 +2,8 @@ from pikepdf import Pdf import pandas as pd import numpy as np +import re +import logging def get_raw_df(filename, num_pages, config): @@ -25,8 +27,8 @@ def get_raw_df(filename, num_pages, config): pandas_options={"dtype": str}, java_options=[ "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", - "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog" - ] + "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", + ], ) if df is not None and len(df) > 0: dfs.extend(df) @@ -34,6 +36,26 @@ def get_raw_df(filename, num_pages, config): return statement +def clean_prestrip(df, config): + key = config["columns"][config["cleaning"]["prestrip"][0]] + value = config["cleaning"]["prestrip"][1] + # df.ix[:, ~df[key].str.match(value)== True] + df = df[~df[key].str.match(value) == True] + return df + + +def format_currency_number(s): + decimal_separator = "." + re_real = "[^\d" + decimal_separator + "]+" + re_negative = "(^-|(?i)DR)|(-|(?i)DR$)" + s = str(s) + flag_negative = True if bool(re.search(re_negative, s)) else False + s = re.sub(re_real, "", s) + if flag_negative: + s = "-" + s + return s + + def format_negatives(s): s = str(s) if s.endswith("-"): @@ -45,29 +67,9 @@ def format_negatives(s): def clean_numeric(df, config): numeric_cols = [config["columns"][col] for col in config["cleaning"]["numeric"]] - for col in numeric_cols: - df[col] = df[col].apply(format_negatives) - df[col] = df[col].str.replace(" ", "") - df[col] = pd.to_numeric( - df[col], - errors="coerce" - ) - - -def clean_date(df, config): - date_cols = [config["columns"][col] for col in config["cleaning"]["date"]] - if "date_format" in config["cleaning"]: - date_format = config["cleaning"]["date_format"] - else: - date_format = None - - for col in date_cols: - df[col] = pd.to_datetime( - df[col], - errors="coerce", - format=date_format - ) + df[col] = df[col].apply(format_currency_number) + df[col] = pd.to_numeric(df[col], errors="coerce") def clean_trans_detail(df, config): @@ -84,6 +86,40 @@ def clean_trans_detail(df, config): df.loc[i - 1, trans_detail] = row[trans_type] +def clean_unwrap(df, config): + key = config["columns"][config["cleaning"]["unwrap"][0]] + value = config["columns"][config["cleaning"]["unwrap"][1]] + j = 0 + for i, row in df.iterrows(): + if pd.isnull(row[key]): + df.loc[j, value] += " " + df.loc[i, value] + else: + j = i + + +def clean_date(df, config): + date_cols = [config["columns"][col] for col in config["cleaning"]["date"]] + if "date_format" in config["cleaning"]: + date_format = config["cleaning"]["date_format"] + else: + date_format = None + + year = df.iloc[0, 1][0:4] + for col in date_cols: + df[col] += " " + year + # print(df[col]) + df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format + " %Y") + # print(type(df)) + print(df) + + +def clean_case(df, config): + cols = [config["columns"][col] for col in config["cleaning"]["case"]] + for col in cols: + df[col] = df[col].str.title() + return df + + def clean_dropna(df, config): drop_cols = [config["columns"][col] for col in config["cleaning"]["dropna"]] df.dropna(subset=drop_cols, inplace=True) @@ -95,20 +131,31 @@ def reorder_columns(df, config): def parse_statement(filename, config): + logging.basicConfig(level=logging.ERROR) + logger = logging.getLogger() pdf = Pdf.open(filename) num_pages = len(pdf.pages) statement = get_raw_df(filename, num_pages, config) + logging.debug(statement["Transaction"]) + + if "prestrip" in config["cleaning"]: + statement = clean_prestrip(statement, config) if "numeric" in config["cleaning"]: clean_numeric(statement, config) - if "trans_detail" in config["cleaning"]: - clean_trans_detail(statement, config) - + if "unwrap" in config["cleaning"]: + clean_unwrap(statement, config) + if "date" in config["cleaning"]: clean_date(statement, config) - + + if "case" in config["cleaning"]: + statement = clean_case(statement, config) + + print(statement.info()) + if "dropna" in config["cleaning"]: clean_dropna(statement, config) From 177fd1337b7a29907dcb6021c87628e5766202c7 Mon Sep 17 00:00:00 2001 From: flywire Date: Wed, 23 Jun 2021 21:07:46 +1000 Subject: [PATCH 02/16] Extended cba json config file --- pdf_statement_reader/config/cba/saving.json | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 pdf_statement_reader/config/cba/saving.json diff --git a/pdf_statement_reader/config/cba/saving.json b/pdf_statement_reader/config/cba/saving.json new file mode 100644 index 0000000..2dc540c --- /dev/null +++ b/pdf_statement_reader/config/cba/saving.json @@ -0,0 +1,56 @@ +{ + "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json", + "//": "Describe layout for pages to be scanned", + "layout": { + "//": "Default layout for all pages not otherwise defined", + "default": { + "//": "Page coordinates containing table in pts", + "//": "[top, left, bottom, right]", + "area": [143, 58, 760, 546], + "//": "Right x coordinate of each column in table", + "columns": [93, 344, 402, 460, 546] + }, + "//": "Layout for first page", + "first": { + "area": [385, 58, 760, 546], + "columns": [93, 344, 402, 460, 546] + } + }, + + "//": "Statement column names exactly as they appear", + "columns": { + "trans_date": "Date", + "trans_detail": "Transaction", + "debit": "Debit", + "credit": "Credit", + "balance": "Balance" + }, + + "//": "csv column output order", + "order": [ + "trans_date", + "trans_detail", + "debit", + "credit", + "balance" + ], + + "//": "Specify required cleaning operations", + "cleaning": { + "//": "Remove key column transactions using value", + "prestrip": ["trans_detail", "(?i)(^Balance.*Forward$)|(Closing Balance)"], + "//": "Convert these columns to numeric", + "numeric": ["debit", "credit", "balance"], + "//": "Convert these columns to date", + "date": ["trans_date"], + "//": "Use this date format to parse any date columns", + "date_format": "%d %b", + "trans_detail": "below", + "//": "Key column and value column to unwrap", + "unwrap": ["trans_date", "trans_detail"], + "//": "Change to Title Case", + "case": ["trans_detail"], + "//": "Only keep the rows where these columns are populated", + "dropna": ["trans_date"] + } +} From b3fe25ed8f09542b0c2d2b5be6c6e63b4fc1068d Mon Sep 17 00:00:00 2001 From: flywire Date: Wed, 23 Jun 2021 21:08:52 +1000 Subject: [PATCH 03/16] Draft tests for extended functionality --- tests/test_parse_methods.py | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/test_parse_methods.py b/tests/test_parse_methods.py index 23eaad3..1cdf2b1 100644 --- a/tests/test_parse_methods.py +++ b/tests/test_parse_methods.py @@ -1,4 +1,14 @@ from pdf_statement_reader.parse import format_negatives +from pdf_statement_reader.parse import format_currency_number +from pdf_statement_reader.parse import clean_prestrip +# from pdf_statement_reader.parse import clean_numeric +# from pdf_statement_reader.parse import clean_trans_detail +from pdf_statement_reader.parse import clean_unwrap +# from pdf_statement_reader.parse import clean_date +from pdf_statement_reader.parse import clean_case +# from pdf_statement_reader.parse import clean_dropna +# from pdf_statement_reader.parse import reorder_columns + def test_format_negatives(): assert format_negatives(123.45) == "123.45" @@ -8,3 +18,61 @@ def test_format_negatives(): assert format_negatives("-123.45") == "-123.45" assert format_negatives("0") == "0" assert format_negatives("123.45-") == "-123.45" + + +def test_format_currency_number(): + assert format_currency_number('123.45') == "123.45" + assert format_currency_number('$123.45') == "123.45" + assert format_currency_number('$123.45 CR') == "123.45" + assert format_currency_number('-1,234.56') == "-1234.56" + assert format_currency_number('1,234.56-') == "-1234.56" + assert format_currency_number('1,234.56 DR') == "-1234.56" + assert format_currency_number('-$1,234.56 dr') == "-1234.56" + assert format_currency_number('0') == "0" + assert format_currency_number('-1') == "-1" + assert format_currency_number('.12') == ".12" + assert format_currency_number('“1”') == "1" + + +def test_clean_prestrip(): + + _df = pd.DataFrame([['Field1':["",""], + 'Faction':["Test String", "Another test"]]) + _config = {'$schema': '', + 'columns': {'F1': 'Field1', 'Key': 'Faction'}, + 'cleaning': {'prestrip': ['Key', 'test']} + } + + df = _df + config = _config + assert clean_prestrip(df, config) == [['Field1':["","Test String"]] + + df = _df + config = _config + assert clean_prestrip(df, config) == [] + + +def test_clean_unwrap(): + _df = pd.DataFrame({['Date':["01/01/2020","","","26/05/2020"], + 'Faction':["Test String", "Another test","Short", "Last bit."]}) + _config = {'$schema': '', + 'columns': {'Key': 'Date', 'F2': 'Faction'}, + 'cleaning': {'unwrap': ['Key', 'F2']} + } + + df = _df + config = _config + assert clean_unwrap(df, config) == [['Date':["01/01/2020","","","26/05/2020"], + 'Faction':["Test String Another test Short", "Another test","Short", "Last bit."]] + + +def test_clean_case(): + _df = pd.DataFrame({['Faction':["test string", "ANOTHER TEst","shORT", "last Bit."]}) + _config = {'$schema': '', + 'columns': {'Key': 'Date', 'F2': 'Faction'}, + 'cleaning': {'unwrap': ['Key', 'F2']} + } + + df = _df + config = _config + assert clean_case(df, config) == ['Faction':["Test String", "Another Tst","Short", "Last Bit."] From 271df2842c28b1eaa482fdfe96882f5edb8b833f Mon Sep 17 00:00:00 2001 From: flywire Date: Wed, 23 Jun 2021 21:10:59 +1000 Subject: [PATCH 04/16] csv date_format='%d/%m/%y' --- pdf_statement_reader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf_statement_reader/__init__.py b/pdf_statement_reader/__init__.py index c9f06ea..8c33f73 100644 --- a/pdf_statement_reader/__init__.py +++ b/pdf_statement_reader/__init__.py @@ -78,7 +78,7 @@ def pdf2csv(input_filename, output_filename=None, config_spec=None): output_filename = input_filename.split(".pdf")[0] + ".csv" df = parse_statement(input_filename, config) - df.to_csv(output_filename, index=False, float_format="%.2f") + df.to_csv(output_filename, index=False, float_format="%.2f", date_format='%d/%m/%y') click.echo("Converted {} and saved as {}".format(input_filename, output_filename)) From 315c6ff8dfb584d32a6d9fff1d65f4def8216ee4 Mon Sep 17 00:00:00 2001 From: flywire Date: Wed, 23 Jun 2021 21:31:53 +1000 Subject: [PATCH 05/16] Remove debug code --- pdf_statement_reader/parse.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index 375a945..c622138 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -107,10 +107,7 @@ def clean_date(df, config): year = df.iloc[0, 1][0:4] for col in date_cols: df[col] += " " + year - # print(df[col]) df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format + " %Y") - # print(type(df)) - print(df) def clean_case(df, config): @@ -154,8 +151,6 @@ def parse_statement(filename, config): if "case" in config["cleaning"]: statement = clean_case(statement, config) - print(statement.info()) - if "dropna" in config["cleaning"]: clean_dropna(statement, config) From 3d600f556444ab96c8113dda3e07cdc003f3139d Mon Sep 17 00:00:00 2001 From: flywire Date: Thu, 24 Jun 2021 22:47:35 +1000 Subject: [PATCH 06/16] Layout for no text-based header logging.debug lines retained --- pdf_statement_reader/parse.py | 61 ++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index c622138..4feae66 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -24,7 +24,7 @@ def get_raw_df(filename, num_pages, config): columns=columns, stream=True, guess=False, - pandas_options={"dtype": str}, + pandas_options={"dtype": str, "header": None}, java_options=[ "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", @@ -32,6 +32,10 @@ def get_raw_df(filename, num_pages, config): ) if df is not None and len(df) > 0: dfs.extend(df) + + if config["layout"]["pandas_options"]["header"] == "None": + for df in dfs: + df.columns = [config["columns"][col] for col in config["order"]] statement = pd.concat(dfs, sort=False).reset_index(drop=True) return statement @@ -39,15 +43,16 @@ def get_raw_df(filename, num_pages, config): def clean_prestrip(df, config): key = config["columns"][config["cleaning"]["prestrip"][0]] value = config["cleaning"]["prestrip"][1] + df.dropna(subset=[key], inplace=True) # df.ix[:, ~df[key].str.match(value)== True] df = df[~df[key].str.match(value) == True] return df def format_currency_number(s): - decimal_separator = "." - re_real = "[^\d" + decimal_separator + "]+" - re_negative = "(^-|(?i)DR)|(-|(?i)DR$)" + DECIMAL_SEPARATOR = "." + re_real = "[^\d" + DECIMAL_SEPARATOR + "]+" + re_negative = "(?i)(^-|DR)|(-|DR$)" s = str(s) flag_negative = True if bool(re.search(re_negative, s)) else False s = re.sub(re_real, "", s) @@ -86,17 +91,6 @@ def clean_trans_detail(df, config): df.loc[i - 1, trans_detail] = row[trans_type] -def clean_unwrap(df, config): - key = config["columns"][config["cleaning"]["unwrap"][0]] - value = config["columns"][config["cleaning"]["unwrap"][1]] - j = 0 - for i, row in df.iterrows(): - if pd.isnull(row[key]): - df.loc[j, value] += " " + df.loc[i, value] - else: - j = i - - def clean_date(df, config): date_cols = [config["columns"][col] for col in config["cleaning"]["date"]] if "date_format" in config["cleaning"]: @@ -104,10 +98,26 @@ def clean_date(df, config): else: date_format = None - year = df.iloc[0, 1][0:4] + cba = False # json setting needed + if cba: + year = df.iloc[0, 1][0:4] + date_format += " %Y" for col in date_cols: - df[col] += " " + year - df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format + " %Y") + if cba: + df[col] += " " + year + df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format) + + +def clean_unwrap(df, config): + key = config["columns"][config["cleaning"]["unwrap"][0]] + val = config["columns"][config["cleaning"]["unwrap"][1]] + j = 0 + for i, row in df.iterrows(): + if pd.isnull(row[key]): + if pd.notna(df.loc[i, val]): + df.loc[j, val] += " " + df.loc[i, val] + else: + j = i def clean_case(df, config): @@ -134,26 +144,33 @@ def parse_statement(filename, config): num_pages = len(pdf.pages) statement = get_raw_df(filename, num_pages, config) - logging.debug(statement["Transaction"]) + logging.debug(statement) + logging.debug("**" + "prestrip") if "prestrip" in config["cleaning"]: statement = clean_prestrip(statement, config) + logging.debug("**" + "numeric") if "numeric" in config["cleaning"]: clean_numeric(statement, config) - if "unwrap" in config["cleaning"]: - clean_unwrap(statement, config) - + logging.debug("**" + "date") if "date" in config["cleaning"]: clean_date(statement, config) + logging.debug("**" + "unwrap") + if "unwrap" in config["cleaning"]: + clean_unwrap(statement, config) + + logging.debug("**" + "case") if "case" in config["cleaning"]: statement = clean_case(statement, config) + logging.debug("**" + "dropna") if "dropna" in config["cleaning"]: clean_dropna(statement, config) + logging.debug("**" + "order") if "order" in config: statement = reorder_columns(statement, config) From da397e0eb5aae5d1000687c483ce9e55bb7a06b4 Mon Sep 17 00:00:00 2001 From: flywire Date: Thu, 24 Jun 2021 22:47:40 +1000 Subject: [PATCH 07/16] citi cheque.json Uses extended functionality --- pdf_statement_reader/config/citi/cheque.json | 60 ++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 pdf_statement_reader/config/citi/cheque.json diff --git a/pdf_statement_reader/config/citi/cheque.json b/pdf_statement_reader/config/citi/cheque.json new file mode 100644 index 0000000..d6283bd --- /dev/null +++ b/pdf_statement_reader/config/citi/cheque.json @@ -0,0 +1,60 @@ +{ + "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json", + "//": "Describe layout for pages to be scanned", + "layout": { + "//": "Default layout for all pages not otherwise defined", + "default": { + "//": "Page coordinates containing table in pts", + "//": "[top, left, bottom, right]", + "area": [190, 47, 800, 558], + "//": "Right x coordinate of each column in table", + "columns": [105, 275, 340, 444, 558] + }, + "//": "Layout for first page", + "first": { + "area": [530, 47, 800, 558], + "columns": [105, 275, 340, 444, 558] + }, + "//": "Layout for no text-based header", + "pandas_options": { + "header": "None" + } + }, + + "//": "Statement column names exactly as they appear", + "columns": { + "trans_date": "Date", + "trans_detail": "Description", + "debit": "Debit", + "credit": "Credit", + "balance": "Balance" + }, + + "//": "csv column output order", + "order": [ + "trans_date", + "trans_detail", + "debit", + "credit", + "balance" + ], + + "//": "Specify required cleaning operations", + "cleaning": { + "//": "Remove key column transactions using value", + "prestrip": ["trans_detail", "(?i)(.*Balance$)|(Tota)"], + "//": "Convert these columns to numeric", + "numeric": ["debit", "credit", "balance"], + "//": "Convert these columns to date", + "date": ["trans_date"], + "//": "Use this date format to parse any date columns", + "date_format": "%d %b %Y", + "trans_detail": "below", + "//": "Key column and value column to unwrap", + "unwrap": ["trans_date", "trans_detail"], + "//": "Change to Title Case", + "case": ["trans_detail"], + "//": "Only keep the rows where these columns are populated", + "dropna": ["trans_date"] + } +} From e97cd9d059f9a85894e5d752bc69d5f9dc30434a Mon Sep 17 00:00:00 2001 From: flywire Date: Fri, 25 Jun 2021 13:56:16 +1000 Subject: [PATCH 08/16] clean_truncate removes misidentified trailing data --- pdf_statement_reader/parse.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index 4feae66..d60f3b3 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -8,6 +8,9 @@ def get_raw_df(filename, num_pages, config): dfs = [] + pandas_options={"dtype": str}, + if config["layout"]["pandas_options"]["header"] == "None": + pandas_options={"dtype": str, "header": None} for i in range(num_pages): if i == 0 and "first" in config["layout"]: @@ -24,7 +27,7 @@ def get_raw_df(filename, num_pages, config): columns=columns, stream=True, guess=False, - pandas_options={"dtype": str, "header": None}, + pandas_options={"dtype": str}, java_options=[ "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", @@ -40,6 +43,14 @@ def get_raw_df(filename, num_pages, config): return statement +def clean_truncate(df, config): + key = config["columns"][config["cleaning"]["truncate"][0]] + value = config["cleaning"]["truncate"][1] + if not df[df[key]==value].empty: + df = df.iloc[:df[df[key]==value].index[0]] + return df + + def clean_prestrip(df, config): key = config["columns"][config["cleaning"]["prestrip"][0]] value = config["cleaning"]["prestrip"][1] @@ -146,32 +157,36 @@ def parse_statement(filename, config): statement = get_raw_df(filename, num_pages, config) logging.debug(statement) - logging.debug("**" + "prestrip") + if "truncate" in config["cleaning"]: + logging.debug("**" + "truncate") + statement = clean_truncate(statement, config) + if "prestrip" in config["cleaning"]: + logging.debug("**" + "prestrip") statement = clean_prestrip(statement, config) - logging.debug("**" + "numeric") if "numeric" in config["cleaning"]: + logging.debug("**" + "numeric") clean_numeric(statement, config) - logging.debug("**" + "date") if "date" in config["cleaning"]: + logging.debug("**" + "date") clean_date(statement, config) - logging.debug("**" + "unwrap") if "unwrap" in config["cleaning"]: + logging.debug("**" + "unwrap") clean_unwrap(statement, config) - logging.debug("**" + "case") if "case" in config["cleaning"]: + logging.debug("**" + "case") statement = clean_case(statement, config) - logging.debug("**" + "dropna") if "dropna" in config["cleaning"]: + logging.debug("**" + "dropna") clean_dropna(statement, config) - logging.debug("**" + "order") if "order" in config: + logging.debug("**" + "order") statement = reorder_columns(statement, config) return statement From 72840a699fb5383e976739c2244cab7f21426713 Mon Sep 17 00:00:00 2001 From: flywire Date: Fri, 25 Jun 2021 14:05:39 +1000 Subject: [PATCH 09/16] pandas_options --- pdf_statement_reader/parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index d60f3b3..c97e3f0 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -8,9 +8,9 @@ def get_raw_df(filename, num_pages, config): dfs = [] - pandas_options={"dtype": str}, + _pandas_options={"dtype": str} if config["layout"]["pandas_options"]["header"] == "None": - pandas_options={"dtype": str, "header": None} + _pandas_options={"dtype": str, "header": None} for i in range(num_pages): if i == 0 and "first" in config["layout"]: @@ -27,7 +27,7 @@ def get_raw_df(filename, num_pages, config): columns=columns, stream=True, guess=False, - pandas_options={"dtype": str}, + pandas_options=_pandas_options, java_options=[ "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", From 44c948f1e8b371c58e186ce5edf325958bbd7c34 Mon Sep 17 00:00:00 2001 From: flywire Date: Fri, 25 Jun 2021 19:54:23 +1000 Subject: [PATCH 10/16] Process date_format "%d %b" with no year --- pdf_statement_reader/parse.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index c97e3f0..37125ae 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -108,13 +108,14 @@ def clean_date(df, config): date_format = config["cleaning"]["date_format"] else: date_format = None - - cba = False # json setting needed - if cba: + if date_format == "%d %b": + no_year = True year = df.iloc[0, 1][0:4] date_format += " %Y" + else: + no_year = False for col in date_cols: - if cba: + if no_year: df[col] += " " + year df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format) From 83fae2bbffbb023370b8712ea0585ceae6ca990c Mon Sep 17 00:00:00 2001 From: flywire Date: Fri, 25 Jun 2021 21:48:04 +1000 Subject: [PATCH 11/16] json truncate removes misidentified trailing data --- pdf_statement_reader/config/citi/cheque.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pdf_statement_reader/config/citi/cheque.json b/pdf_statement_reader/config/citi/cheque.json index d6283bd..a1759a8 100644 --- a/pdf_statement_reader/config/citi/cheque.json +++ b/pdf_statement_reader/config/citi/cheque.json @@ -41,6 +41,8 @@ "//": "Specify required cleaning operations", "cleaning": { + "//": "Truncate from key column transactions using value", + "truncate": ["trans_detail", "CLOSING BALANCE"], "//": "Remove key column transactions using value", "prestrip": ["trans_detail", "(?i)(.*Balance$)|(Tota)"], "//": "Convert these columns to numeric", From 31b5ad9d0b14124cbfbc3fa2458ec4506b0718a7 Mon Sep 17 00:00:00 2001 From: flywire Date: Sat, 26 Jun 2021 00:03:39 +1000 Subject: [PATCH 12/16] pandas_options not required in json --- pdf_statement_reader/parse.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index 37125ae..97c9996 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -9,8 +9,12 @@ def get_raw_df(filename, num_pages, config): dfs = [] _pandas_options={"dtype": str} - if config["layout"]["pandas_options"]["header"] == "None": - _pandas_options={"dtype": str, "header": None} + header = True + if config["layout"].get("pandas_options"): + _pandas_options.update(config["layout"].get("pandas_options")) + if config["layout"]["pandas_options"].get("header") \ + and config["layout"]["pandas_options"].get("header") == "None": + header = False for i in range(num_pages): if i == 0 and "first" in config["layout"]: @@ -36,7 +40,7 @@ def get_raw_df(filename, num_pages, config): if df is not None and len(df) > 0: dfs.extend(df) - if config["layout"]["pandas_options"]["header"] == "None": + if not header: for df in dfs: df.columns = [config["columns"][col] for col in config["order"]] statement = pd.concat(dfs, sort=False).reset_index(drop=True) From 18a56cbb2bb30e3ccb211fb2dcdbf1ed293c4bf0 Mon Sep 17 00:00:00 2001 From: flywire Date: Sun, 27 Jun 2021 00:05:35 +1000 Subject: [PATCH 13/16] test_clean_case --- tests/test_parse_methods.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_parse_methods.py b/tests/test_parse_methods.py index 1cdf2b1..3b7e214 100644 --- a/tests/test_parse_methods.py +++ b/tests/test_parse_methods.py @@ -1,3 +1,7 @@ +import unittest +import pandas as pd +from pandas._testing import assert_frame_equal + from pdf_statement_reader.parse import format_negatives from pdf_statement_reader.parse import format_currency_number from pdf_statement_reader.parse import clean_prestrip @@ -67,12 +71,11 @@ def test_clean_unwrap(): def test_clean_case(): - _df = pd.DataFrame({['Faction':["test string", "ANOTHER TEst","shORT", "last Bit."]}) - _config = {'$schema': '', - 'columns': {'Key': 'Date', 'F2': 'Faction'}, - 'cleaning': {'unwrap': ['Key', 'F2']} + df1 = pd.DataFrame({'Faction':["test string", "ANOTHER TEst","shORT", "last Bit."]}) + df2 = pd.DataFrame({'Faction':["Test String", "Another Test","Short", "Last Bit."]}) + config = {'$schema': '', + 'cleaning': {'case': ['F1']}, + 'columns': {'F1': 'Faction'} } - df = _df - config = _config - assert clean_case(df, config) == ['Faction':["Test String", "Another Tst","Short", "Last Bit."] + assert_frame_equal(clean_case(df1, config), df2) From f8231f8a47a308d2929a080b519d4bd7b219e058 Mon Sep 17 00:00:00 2001 From: flywire Date: Sun, 27 Jun 2021 10:11:08 +1000 Subject: [PATCH 14/16] test_clean_prestrip --- tests/test_parse_methods.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/test_parse_methods.py b/tests/test_parse_methods.py index 3b7e214..30fa8be 100644 --- a/tests/test_parse_methods.py +++ b/tests/test_parse_methods.py @@ -39,21 +39,17 @@ def test_format_currency_number(): def test_clean_prestrip(): - - _df = pd.DataFrame([['Field1':["",""], - 'Faction':["Test String", "Another test"]]) - _config = {'$schema': '', - 'columns': {'F1': 'Field1', 'Key': 'Faction'}, - 'cleaning': {'prestrip': ['Key', 'test']} - } + df1 = pd.DataFrame({"Field1": ["", ""], "Faction": ["Test String", "Another test"]}) + df2 = pd.DataFrame({"Field1": [""], "Faction": ["Another test"]}) + config = { + "$schema": "", + "cleaning": {"prestrip": ["Key", "Test"]}, + "columns": {"F1": "Field1", "Key": "Faction"}, + } - df = _df - config = _config - assert clean_prestrip(df, config) == [['Field1':["","Test String"]] - - df = _df - config = _config - assert clean_prestrip(df, config) == [] + assert_frame_equal( + clean_prestrip(df1, config).reset_index(drop=True), df2.reset_index(drop=True) + ) def test_clean_unwrap(): From 1e65da1b6e01ae51e3798539442f681447a296c0 Mon Sep 17 00:00:00 2001 From: flywire Date: Sun, 27 Jun 2021 21:53:39 +1000 Subject: [PATCH 15/16] Explicit returns on all functions to support `assert` in tests --- pdf_statement_reader/parse.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index 97c9996..82a540a 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -8,13 +8,15 @@ def get_raw_df(filename, num_pages, config): dfs = [] - _pandas_options={"dtype": str} + _pandas_options = {"dtype": str} header = True if config["layout"].get("pandas_options"): _pandas_options.update(config["layout"].get("pandas_options")) - if config["layout"]["pandas_options"].get("header") \ - and config["layout"]["pandas_options"].get("header") == "None": - header = False + if ( + config["layout"]["pandas_options"].get("header") + and config["layout"]["pandas_options"].get("header") == "None" + ): + header = False for i in range(num_pages): if i == 0 and "first" in config["layout"]: @@ -50,8 +52,8 @@ def get_raw_df(filename, num_pages, config): def clean_truncate(df, config): key = config["columns"][config["cleaning"]["truncate"][0]] value = config["cleaning"]["truncate"][1] - if not df[df[key]==value].empty: - df = df.iloc[:df[df[key]==value].index[0]] + if not df[df[key] == value].empty: + df = df.iloc[: df[df[key] == value].index[0]] return df @@ -90,6 +92,7 @@ def clean_numeric(df, config): for col in numeric_cols: df[col] = df[col].apply(format_currency_number) df[col] = pd.to_numeric(df[col], errors="coerce") + return df def clean_trans_detail(df, config): @@ -104,6 +107,7 @@ def clean_trans_detail(df, config): continue if np.isnan(row[balance]): df.loc[i - 1, trans_detail] = row[trans_type] + return df def clean_date(df, config): @@ -122,6 +126,7 @@ def clean_date(df, config): if no_year: df[col] += " " + year df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format) + return df def clean_unwrap(df, config): @@ -134,6 +139,7 @@ def clean_unwrap(df, config): df.loc[j, val] += " " + df.loc[i, val] else: j = i + return df def clean_case(df, config): @@ -146,6 +152,7 @@ def clean_case(df, config): def clean_dropna(df, config): drop_cols = [config["columns"][col] for col in config["cleaning"]["dropna"]] df.dropna(subset=drop_cols, inplace=True) + return df def reorder_columns(df, config): @@ -172,15 +179,15 @@ def parse_statement(filename, config): if "numeric" in config["cleaning"]: logging.debug("**" + "numeric") - clean_numeric(statement, config) + statement = clean_numeric(statement, config) if "date" in config["cleaning"]: logging.debug("**" + "date") - clean_date(statement, config) + statement = clean_date(statement, config) if "unwrap" in config["cleaning"]: logging.debug("**" + "unwrap") - clean_unwrap(statement, config) + statement = clean_unwrap(statement, config) if "case" in config["cleaning"]: logging.debug("**" + "case") @@ -188,7 +195,7 @@ def parse_statement(filename, config): if "dropna" in config["cleaning"]: logging.debug("**" + "dropna") - clean_dropna(statement, config) + statement = clean_dropna(statement, config) if "order" in config: logging.debug("**" + "order") From 6fcd0ac5e62b29c752263c8e95a680a1f72eb71a Mon Sep 17 00:00:00 2001 From: flywire Date: Mon, 28 Jun 2021 00:09:46 +1000 Subject: [PATCH 16/16] tests clean_date clean_dropna reorder_columns test_clean_date test_clean_dropna test_reorder_columns --- tests/test_parse_methods.py | 120 ++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 32 deletions(-) diff --git a/tests/test_parse_methods.py b/tests/test_parse_methods.py index 30fa8be..adb42dc 100644 --- a/tests/test_parse_methods.py +++ b/tests/test_parse_methods.py @@ -8,10 +8,10 @@ # from pdf_statement_reader.parse import clean_numeric # from pdf_statement_reader.parse import clean_trans_detail from pdf_statement_reader.parse import clean_unwrap -# from pdf_statement_reader.parse import clean_date +from pdf_statement_reader.parse import clean_date from pdf_statement_reader.parse import clean_case -# from pdf_statement_reader.parse import clean_dropna -# from pdf_statement_reader.parse import reorder_columns +from pdf_statement_reader.parse import clean_dropna +from pdf_statement_reader.parse import reorder_columns def test_format_negatives(): @@ -25,17 +25,17 @@ def test_format_negatives(): def test_format_currency_number(): - assert format_currency_number('123.45') == "123.45" - assert format_currency_number('$123.45') == "123.45" - assert format_currency_number('$123.45 CR') == "123.45" - assert format_currency_number('-1,234.56') == "-1234.56" - assert format_currency_number('1,234.56-') == "-1234.56" - assert format_currency_number('1,234.56 DR') == "-1234.56" - assert format_currency_number('-$1,234.56 dr') == "-1234.56" - assert format_currency_number('0') == "0" - assert format_currency_number('-1') == "-1" - assert format_currency_number('.12') == ".12" - assert format_currency_number('“1”') == "1" + assert format_currency_number("123.45") == "123.45" + assert format_currency_number("$123.45") == "123.45" + assert format_currency_number("$123.45 CR") == "123.45" + assert format_currency_number("-1,234.56") == "-1234.56" + assert format_currency_number("1,234.56-") == "-1234.56" + assert format_currency_number("1,234.56 DR") == "-1234.56" + assert format_currency_number("-$1,234.56 dr") == "-1234.56" + assert format_currency_number("0") == "0" + assert format_currency_number("-1") == "-1" + assert format_currency_number(".12") == ".12" + assert format_currency_number("“1”") == "1" def test_clean_prestrip(): @@ -46,32 +46,88 @@ def test_clean_prestrip(): "cleaning": {"prestrip": ["Key", "Test"]}, "columns": {"F1": "Field1", "Key": "Faction"}, } - assert_frame_equal( clean_prestrip(df1, config).reset_index(drop=True), df2.reset_index(drop=True) ) def test_clean_unwrap(): - _df = pd.DataFrame({['Date':["01/01/2020","","","26/05/2020"], - 'Faction':["Test String", "Another test","Short", "Last bit."]}) - _config = {'$schema': '', - 'columns': {'Key': 'Date', 'F2': 'Faction'}, - 'cleaning': {'unwrap': ['Key', 'F2']} - } + df1 = pd.DataFrame( + { + "Date": ["01/01/2020", pd.NA, pd.NA, "26/05/2020"], + "Faction": ["Test String", "Another test", "Short", "Last bit."], + } + ) + df2 = pd.DataFrame( + { + "Date": ["01/01/2020", pd.NA, pd.NA, "26/05/2020"], + "Faction": [ + "Test String Another test Short", + "Another test", + "Short", + "Last bit.", + ], + } + ) + config = { + "$schema": "", + "columns": {"Key": "Date", "F2": "Faction"}, + "cleaning": {"unwrap": ["Key", "F2"]}, + } + assert_frame_equal(clean_unwrap(df1, config), df2) - df = _df - config = _config - assert clean_unwrap(df, config) == [['Date':["01/01/2020","","","26/05/2020"], - 'Faction':["Test String Another test Short", "Another test","Short", "Last bit."]] +def test_clean_date(): + df1 = pd.DataFrame({"Faction": ["01/02/03"]}) + df2 = pd.DataFrame({"Faction": ["2003-02-01"]}) + df2["Faction"] = pd.to_datetime(df2["Faction"]) + config = { + "$schema": "", + "cleaning": { + "date": ["F1"], + "date_format": "%d/%m/%y", + }, + "columns": {"F1": "Faction"}, + } + assert_frame_equal(clean_date(df1, config), df2) -def test_clean_case(): - df1 = pd.DataFrame({'Faction':["test string", "ANOTHER TEst","shORT", "last Bit."]}) - df2 = pd.DataFrame({'Faction':["Test String", "Another Test","Short", "Last Bit."]}) - config = {'$schema': '', - 'cleaning': {'case': ['F1']}, - 'columns': {'F1': 'Faction'} - } +def test_clean_case(): + df1 = pd.DataFrame( + {"Faction": ["test string", "ANOTHER TEst", "shORT", "last Bit."]} + ) + df2 = pd.DataFrame( + {"Faction": ["Test String", "Another Test", "Short", "Last Bit."]} + ) + config = {"$schema": "", "cleaning": {"case": ["F1"]}, "columns": {"F1": "Faction"}} assert_frame_equal(clean_case(df1, config), df2) + + +def test_clean_dropna(): + df1 = pd.DataFrame( + {"Field1": [pd.NA, "1.23"], "Faction": ["Test String", "Another test"]} + ) + df2 = pd.DataFrame({"Field1": ["1.23"], "Faction": ["Another test"]}) + config = { + "$schema": "", + "cleaning": {"dropna": ["F1"]}, + "columns": {"F1": "Field1"}, + } + assert_frame_equal( + clean_dropna(df1, config).reset_index(drop=True), df2.reset_index(drop=True) + ) + + +def test_reorder_columns(): + df1 = pd.DataFrame({"Col1": ["", ""], "Col2": ["Aa", "bB"], "Col3": ["xx", "ZZ"]}) + df2 = pd.DataFrame({"Col1": ["", ""], "Col3": ["xx", "ZZ"], "Col2": ["Aa", "bB"]}) + config = {"$schema": "", "order": ["Col1", "Col3", "Col2"]} + config = { + "$schema": "", + "order": ["F1", "F3", "F2"], + "columns": {"F1": "Col1", "F2": "Col2", "F3": "Col3"}, + } + assert_frame_equal( + reorder_columns(df1, config).reset_index(drop=True), df2.reset_index(drop=True) + ) +