diff --git a/pdf_statement_reader/__init__.py b/pdf_statement_reader/__init__.py index c9f06ea..8c33f73 100644 --- a/pdf_statement_reader/__init__.py +++ b/pdf_statement_reader/__init__.py @@ -78,7 +78,7 @@ def pdf2csv(input_filename, output_filename=None, config_spec=None): output_filename = input_filename.split(".pdf")[0] + ".csv" df = parse_statement(input_filename, config) - df.to_csv(output_filename, index=False, float_format="%.2f") + df.to_csv(output_filename, index=False, float_format="%.2f", date_format='%d/%m/%y') click.echo("Converted {} and saved as {}".format(input_filename, output_filename)) diff --git a/pdf_statement_reader/config/cba/saving.json b/pdf_statement_reader/config/cba/saving.json new file mode 100644 index 0000000..2dc540c --- /dev/null +++ b/pdf_statement_reader/config/cba/saving.json @@ -0,0 +1,56 @@ +{ + "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json", + "//": "Describe layout for pages to be scanned", + "layout": { + "//": "Default layout for all pages not otherwise defined", + "default": { + "//": "Page coordinates containing table in pts", + "//": "[top, left, bottom, right]", + "area": [143, 58, 760, 546], + "//": "Right x coordinate of each column in table", + "columns": [93, 344, 402, 460, 546] + }, + "//": "Layout for first page", + "first": { + "area": [385, 58, 760, 546], + "columns": [93, 344, 402, 460, 546] + } + }, + + "//": "Statement column names exactly as they appear", + "columns": { + "trans_date": "Date", + "trans_detail": "Transaction", + "debit": "Debit", + "credit": "Credit", + "balance": "Balance" + }, + + "//": "csv column output order", + "order": [ + "trans_date", + "trans_detail", + "debit", + "credit", + "balance" + ], + + "//": "Specify required cleaning operations", + "cleaning": { + "//": "Remove key column transactions using value", + "prestrip": ["trans_detail", "(?i)(^Balance.*Forward$)|(Closing Balance)"], + 
"//": "Convert these columns to numeric", + "numeric": ["debit", "credit", "balance"], + "//": "Convert these columns to date", + "date": ["trans_date"], + "//": "Use this date format to parse any date columns", + "date_format": "%d %b", + "trans_detail": "below", + "//": "Key column and value column to unwrap", + "unwrap": ["trans_date", "trans_detail"], + "//": "Change to Title Case", + "case": ["trans_detail"], + "//": "Only keep the rows where these columns are populated", + "dropna": ["trans_date"] + } +} diff --git a/pdf_statement_reader/config/citi/cheque.json b/pdf_statement_reader/config/citi/cheque.json new file mode 100644 index 0000000..a1759a8 --- /dev/null +++ b/pdf_statement_reader/config/citi/cheque.json @@ -0,0 +1,62 @@ +{ + "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json", + "//": "Describe layout for pages to be scanned", + "layout": { + "//": "Default layout for all pages not otherwise defined", + "default": { + "//": "Page coordinates containing table in pts", + "//": "[top, left, bottom, right]", + "area": [190, 47, 800, 558], + "//": "Right x coordinate of each column in table", + "columns": [105, 275, 340, 444, 558] + }, + "//": "Layout for first page", + "first": { + "area": [530, 47, 800, 558], + "columns": [105, 275, 340, 444, 558] + }, + "//": "Layout for no text-based header", + "pandas_options": { + "header": "None" + } + }, + + "//": "Statement column names exactly as they appear", + "columns": { + "trans_date": "Date", + "trans_detail": "Description", + "debit": "Debit", + "credit": "Credit", + "balance": "Balance" + }, + + "//": "csv column output order", + "order": [ + "trans_date", + "trans_detail", + "debit", + "credit", + "balance" + ], + + "//": "Specify required cleaning operations", + "cleaning": { + "//": "Truncate from key column transactions using value", + "truncate": ["trans_detail", "CLOSING BALANCE"], + "//": "Remove key column 
transactions using value", + "prestrip": ["trans_detail", "(?i)(.*Balance$)|(Tota)"], + "//": "Convert these columns to numeric", + "numeric": ["debit", "credit", "balance"], + "//": "Convert these columns to date", + "date": ["trans_date"], + "//": "Use this date format to parse any date columns", + "date_format": "%d %b %Y", + "trans_detail": "below", + "//": "Key column and value column to unwrap", + "unwrap": ["trans_date", "trans_detail"], + "//": "Change to Title Case", + "case": ["trans_detail"], + "//": "Only keep the rows where these columns are populated", + "dropna": ["trans_date"] + } +} diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py index 5a5ac10..82a540a 100644 --- a/pdf_statement_reader/parse.py +++ b/pdf_statement_reader/parse.py @@ -2,10 +2,21 @@ from pikepdf import Pdf import pandas as pd import numpy as np +import re +import logging def get_raw_df(filename, num_pages, config): dfs = [] + _pandas_options = {"dtype": str} + header = True + if config["layout"].get("pandas_options"): + _pandas_options.update(config["layout"].get("pandas_options")) + if ( + config["layout"]["pandas_options"].get("header") + and config["layout"]["pandas_options"].get("header") == "None" + ): + header = False for i in range(num_pages): if i == 0 and "first" in config["layout"]: @@ -22,18 +33,51 @@ def get_raw_df(filename, num_pages, config): columns=columns, stream=True, guess=False, - pandas_options={"dtype": str}, + pandas_options=_pandas_options, java_options=[ "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", - "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog" - ] + "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", + ], ) if df is not None and len(df) > 0: dfs.extend(df) + + if not header: + for df in dfs: + df.columns = [config["columns"][col] for col in config["order"]] statement = pd.concat(dfs, sort=False).reset_index(drop=True) return statement +def clean_truncate(df, config): + key = 
def clean_truncate(df, config):
    """Truncate the statement at the configured sentinel row.

    ``config["cleaning"]["truncate"]`` is a ``[column_key, value]`` pair.
    Everything from the first row whose mapped column equals ``value``
    (e.g. "CLOSING BALANCE") onwards is discarded.  Returns the (possibly
    shortened) frame; the input frame is not mutated.
    """
    key = config["columns"][config["cleaning"]["truncate"][0]]
    value = config["cleaning"]["truncate"][1]
    sentinel_rows = df[df[key] == value]
    if not sentinel_rows.empty:
        # NOTE(review): assumes a default RangeIndex so the index label can
        # double as an iloc position -- confirm against get_raw_df's output.
        df = df.iloc[: sentinel_rows.index[0]]
    return df


def clean_prestrip(df, config):
    """Drop unwanted rows before any other cleaning.

    ``config["cleaning"]["prestrip"]`` is ``[column_key, regex]``.  Rows
    whose mapped column is NaN are dropped first (so ``str.match`` only
    sees strings), then rows matching the regex (anchored at the start,
    per ``Series.str.match``) are removed.  Mutates ``df`` via ``dropna``
    and returns the filtered frame.
    """
    key = config["columns"][config["cleaning"]["prestrip"][0]]
    pattern = config["cleaning"]["prestrip"][1]
    df.dropna(subset=[key], inplace=True)
    # ~match keeps non-matching rows; the former "~... == True" comparison
    # was a redundant (and precedence-fragile) way of writing the same thing.
    return df[~df[key].str.match(pattern)]


def format_currency_number(s):
    """Normalise a formatted currency string to a plain numeric string.

    Strips currency symbols, thousands separators and any other
    non-digit characters, and folds a leading/trailing negative marker
    ("-" or "DR", case-insensitive) into a single leading "-" so that
    ``pandas.to_numeric`` can parse the result.  Unparseable input
    degrades to "" (later coerced to NaN by ``to_numeric``).
    """
    DECIMAL_SEPARATOR = "."
    # Everything except digits and the decimal separator is stripped.
    re_real = r"[^\d" + DECIMAL_SEPARATOR + r"]+"
    # Negative iff the string STARTS or ENDS with "-" or "DR".  The previous
    # pattern "(?i)(^-|DR)|(-|DR$)" was mis-grouped: it matched "DR" or "-"
    # anywhere in the string, not just at the edges.
    re_negative = r"(?i)^\s*(-|DR)|(-|DR)\s*$"
    s = str(s)
    is_negative = re.search(re_negative, s) is not None
    s = re.sub(re_real, "", s)
    if is_negative:
        s = "-" + s
    return s


def clean_numeric(df, config):
    """Convert the configured columns to numeric dtype.

    Each column listed under ``config["cleaning"]["numeric"]`` (mapped
    through ``config["columns"]``) is normalised with
    ``format_currency_number`` and then coerced with ``pandas.to_numeric``
    (unparseable values become NaN).  Mutates ``df`` and returns it.
    """
    numeric_cols = [config["columns"][col] for col in config["cleaning"]["numeric"]]
    # Per-column loop restored: the patch text shows the normalisation lines
    # referencing `col` with the "for col in numeric_cols:" line removed,
    # which would raise NameError.
    for col in numeric_cols:
        df[col] = df[col].apply(format_currency_number)
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df
def clean_date(df, config):
    """Convert the configured columns to datetime.

    Uses ``config["cleaning"]["date_format"]`` when given, otherwise lets
    pandas infer.  For the year-less "%d %b" format the year is borrowed
    from the statement data itself before parsing.  Mutates ``df`` and
    returns it.
    """
    date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
    date_format = config["cleaning"].get("date_format")
    no_year = date_format == "%d %b"
    if no_year:
        # First 4 characters of the first row's second column.
        # NOTE(review): presumably that cell starts with the statement year
        # for these layouts -- confirm against the raw extracted table.
        year = df.iloc[0, 1][0:4]
        date_format += " %Y"
    for col in date_cols:
        if no_year:
            df[col] += " " + year
        df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format)
    return df


def clean_unwrap(df, config):
    """Merge wrapped continuation rows into the preceding keyed row.

    ``config["cleaning"]["unwrap"]`` names ``[key_col, value_col]``.  A row
    whose key column is empty is a continuation: its value text is appended
    (space-separated) to the value of the most recent row that had a key.
    Continuation rows themselves are left in place for a later ``dropna``.
    Mutates ``df`` and returns it.
    """
    key_col = config["columns"][config["cleaning"]["unwrap"][0]]
    val_col = config["columns"][config["cleaning"]["unwrap"][1]]
    anchor = 0  # index label of the last row seen with a populated key
    for idx, row in df.iterrows():
        if not pd.isnull(row[key_col]):
            anchor = idx
            continue
        if pd.notna(df.loc[idx, val_col]):
            df.loc[anchor, val_col] = df.loc[anchor, val_col] + " " + df.loc[idx, val_col]
    return df


def clean_case(df, config):
    """Title-case every column listed under ``config["cleaning"]["case"]``."""
    for col in (config["columns"][c] for c in config["cleaning"]["case"]):
        df[col] = df[col].str.title()
    return df


def clean_dropna(df, config):
    """Drop rows missing a value in any configured required column."""
    required = [config["columns"][col] for col in config["cleaning"]["dropna"]]
    df.dropna(subset=required, inplace=True)
    return df


def parse_statement(filename, config):
    """Parse a bank-statement PDF into a cleaned pandas DataFrame.

    Extracts the raw table with ``get_raw_df``, then applies each cleaning
    step whose key appears in ``config["cleaning"]`` in a fixed order
    (truncate, prestrip, numeric, date, unwrap, case, dropna), and finally
    reorders columns when ``config["order"]`` is present.
    """
    logging.basicConfig(level=logging.ERROR)
    # (Removed an unused `logger = logging.getLogger()` local.)
    pdf = Pdf.open(filename)
    num_pages = len(pdf.pages)
    statement = get_raw_df(filename, num_pages, config)
    logging.debug(statement)

    cleaning = config["cleaning"]
    if "truncate" in cleaning:
        logging.debug("**truncate")
        statement = clean_truncate(statement, config)
    if "prestrip" in cleaning:
        logging.debug("**prestrip")
        statement = clean_prestrip(statement, config)
    if "numeric" in cleaning:
        logging.debug("**numeric")
        statement = clean_numeric(statement, config)
    if "date" in cleaning:
        logging.debug("**date")
        statement = clean_date(statement, config)
    if "unwrap" in cleaning:
        logging.debug("**unwrap")
        statement = clean_unwrap(statement, config)
    if "case" in cleaning:
        logging.debug("**case")
        statement = clean_case(statement, config)
    if "dropna" in cleaning:
        logging.debug("**dropna")
        statement = clean_dropna(statement, config)
    if "order" in config:
        logging.debug("**order")
        statement = reorder_columns(statement, config)

    return statement
"unwrap") + statement = clean_unwrap(statement, config) + + if "case" in config["cleaning"]: + logging.debug("**" + "case") + statement = clean_case(statement, config) + if "dropna" in config["cleaning"]: - clean_dropna(statement, config) + logging.debug("**" + "dropna") + statement = clean_dropna(statement, config) if "order" in config: + logging.debug("**" + "order") statement = reorder_columns(statement, config) return statement diff --git a/tests/test_parse_methods.py b/tests/test_parse_methods.py index 23eaad3..adb42dc 100644 --- a/tests/test_parse_methods.py +++ b/tests/test_parse_methods.py @@ -1,4 +1,18 @@ +import unittest +import pandas as pd +from pandas._testing import assert_frame_equal + from pdf_statement_reader.parse import format_negatives +from pdf_statement_reader.parse import format_currency_number +from pdf_statement_reader.parse import clean_prestrip +# from pdf_statement_reader.parse import clean_numeric +# from pdf_statement_reader.parse import clean_trans_detail +from pdf_statement_reader.parse import clean_unwrap +from pdf_statement_reader.parse import clean_date +from pdf_statement_reader.parse import clean_case +from pdf_statement_reader.parse import clean_dropna +from pdf_statement_reader.parse import reorder_columns + def test_format_negatives(): assert format_negatives(123.45) == "123.45" @@ -8,3 +22,112 @@ def test_format_negatives(): assert format_negatives("-123.45") == "-123.45" assert format_negatives("0") == "0" assert format_negatives("123.45-") == "-123.45" + + +def test_format_currency_number(): + assert format_currency_number("123.45") == "123.45" + assert format_currency_number("$123.45") == "123.45" + assert format_currency_number("$123.45 CR") == "123.45" + assert format_currency_number("-1,234.56") == "-1234.56" + assert format_currency_number("1,234.56-") == "-1234.56" + assert format_currency_number("1,234.56 DR") == "-1234.56" + assert format_currency_number("-$1,234.56 dr") == "-1234.56" + assert 
def test_format_currency_number():
    """Currency strings normalise to plain numeric strings, with leading
    or trailing "-"/"DR" markers folded into a leading minus sign."""
    cases = {
        "123.45": "123.45",
        "$123.45": "123.45",
        "$123.45 CR": "123.45",
        "-1,234.56": "-1234.56",
        "1,234.56-": "-1234.56",
        "1,234.56 DR": "-1234.56",
        "-$1,234.56 dr": "-1234.56",
        "0": "0",
        "-1": "-1",
        ".12": ".12",
        "“1”": "1",
    }
    for raw, expected in cases.items():
        assert format_currency_number(raw) == expected


def test_clean_prestrip():
    """Rows whose key column matches the prestrip regex are removed."""
    config = {
        "$schema": "",
        "cleaning": {"prestrip": ["Key", "Test"]},
        "columns": {"F1": "Field1", "Key": "Faction"},
    }
    before = pd.DataFrame(
        {"Field1": ["", ""], "Faction": ["Test String", "Another test"]}
    )
    after = pd.DataFrame({"Field1": [""], "Faction": ["Another test"]})
    result = clean_prestrip(before, config).reset_index(drop=True)
    assert_frame_equal(result, after.reset_index(drop=True))


def test_clean_unwrap():
    """Continuation rows (empty key) append their text to the last keyed row."""
    config = {
        "$schema": "",
        "columns": {"Key": "Date", "F2": "Faction"},
        "cleaning": {"unwrap": ["Key", "F2"]},
    }
    dates = ["01/01/2020", pd.NA, pd.NA, "26/05/2020"]
    before = pd.DataFrame(
        {
            "Date": dates,
            "Faction": ["Test String", "Another test", "Short", "Last bit."],
        }
    )
    after = pd.DataFrame(
        {
            "Date": dates,
            "Faction": [
                "Test String Another test Short",
                "Another test",
                "Short",
                "Last bit.",
            ],
        }
    )
    assert_frame_equal(clean_unwrap(before, config), after)


def test_clean_date():
    """Date columns are parsed with the configured strftime format."""
    config = {
        "$schema": "",
        "cleaning": {
            "date": ["F1"],
            "date_format": "%d/%m/%y",
        },
        "columns": {"F1": "Faction"},
    }
    before = pd.DataFrame({"Faction": ["01/02/03"]})
    after = pd.DataFrame({"Faction": pd.to_datetime(["2003-02-01"])})
    assert_frame_equal(clean_date(before, config), after)


def test_clean_case():
    """Configured columns are converted to title case."""
    config = {"$schema": "", "cleaning": {"case": ["F1"]}, "columns": {"F1": "Faction"}}
    before = pd.DataFrame(
        {"Faction": ["test string", "ANOTHER TEst", "shORT", "last Bit."]}
    )
    after = pd.DataFrame(
        {"Faction": ["Test String", "Another Test", "Short", "Last Bit."]}
    )
    assert_frame_equal(clean_case(before, config), after)
def test_clean_dropna():
    """Rows with a missing value in a required column are dropped."""
    config = {
        "$schema": "",
        "cleaning": {"dropna": ["F1"]},
        "columns": {"F1": "Field1"},
    }
    before = pd.DataFrame(
        {"Field1": [pd.NA, "1.23"], "Faction": ["Test String", "Another test"]}
    )
    after = pd.DataFrame({"Field1": ["1.23"], "Faction": ["Another test"]})
    assert_frame_equal(
        clean_dropna(before, config).reset_index(drop=True),
        after.reset_index(drop=True),
    )


def test_reorder_columns():
    """Output columns follow config["order"], mapped via config["columns"]."""
    df1 = pd.DataFrame({"Col1": ["", ""], "Col2": ["Aa", "bB"], "Col3": ["xx", "ZZ"]})
    df2 = pd.DataFrame({"Col1": ["", ""], "Col3": ["xx", "ZZ"], "Col2": ["Aa", "bB"]})
    # The original assigned `config` twice; the first literal (raw column
    # names, never read) was dead code and is removed here.
    config = {
        "$schema": "",
        "order": ["F1", "F3", "F2"],
        "columns": {"F1": "Col1", "F2": "Col2", "F3": "Col3"},
    }
    assert_frame_equal(
        reorder_columns(df1, config).reset_index(drop=True),
        df2.reset_index(drop=True),
    )