Enhanced parsing functionality #50

Open · wants to merge 16 commits into base: develop
2 changes: 1 addition & 1 deletion pdf_statement_reader/__init__.py
@@ -78,7 +78,7 @@ def pdf2csv(input_filename, output_filename=None, config_spec=None):
output_filename = input_filename.split(".pdf")[0] + ".csv"

df = parse_statement(input_filename, config)
df.to_csv(output_filename, index=False, float_format="%.2f")
df.to_csv(output_filename, index=False, float_format="%.2f", date_format='%d/%m/%y')
Contributor Author commented:

Suggestion: the json config could support an optional second date format for the output.
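
For example, a minimal sketch of that suggestion (the "output_date_format" key is hypothetical and not part of this diff):

    # Hypothetical config key, shown for illustration only
    date_format = config.get("output_date_format", "%d/%m/%y")
    df.to_csv(output_filename, index=False, float_format="%.2f", date_format=date_format)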

click.echo("Converted {} and saved as {}".format(input_filename, output_filename))


56 changes: 56 additions & 0 deletions pdf_statement_reader/config/cba/saving.json
@@ -0,0 +1,56 @@
{
"$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
"//": "Describe layout for pages to be scanned",
"layout": {
"//": "Default layout for all pages not otherwise defined",
"default": {
"//": "Page coordinates containing table in pts",
"//": "[top, left, bottom, right]",
"area": [143, 58, 760, 546],
"//": "Right x coordinate of each column in table",
"columns": [93, 344, 402, 460, 546]
},
"//": "Layout for first page",
"first": {
"area": [385, 58, 760, 546],
"columns": [93, 344, 402, 460, 546]
}
},

"//": "Statement column names exactly as they appear",
"columns": {
"trans_date": "Date",
"trans_detail": "Transaction",
"debit": "Debit",
"credit": "Credit",
"balance": "Balance"
},

"//": "csv column output order",
"order": [
"trans_date",
"trans_detail",
"debit",
"credit",
"balance"
],

"//": "Specify required cleaning operations",
"cleaning": {
"//": "Remove key column transactions using value",
"prestrip": ["trans_detail", "(?i)(^Balance.*Forward$)|(Closing Balance)"],
"//": "Convert these columns to numeric",
"numeric": ["debit", "credit", "balance"],
"//": "Convert these columns to date",
"date": ["trans_date"],
"//": "Use this date format to parse any date columns",
"date_format": "%d %b",
"trans_detail": "below",
"//": "Key column and value column to unwrap",
"unwrap": ["trans_date", "trans_detail"],
"//": "Change to Title Case",
"case": ["trans_detail"],
"//": "Only keep the rows where these columns are populated",
"dropna": ["trans_date"]
}
}
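
As a rough illustration (not part of this diff), the layout block above maps onto the tabula call in get_raw_df along these lines; tabula-py is assumed to be the extraction backend, and the input filename and page number are placeholders:

    import json
    import tabula

    # Sketch only: feed the "first" page layout from saving.json into tabula-py
    with open("pdf_statement_reader/config/cba/saving.json") as f:
        config = json.load(f)
    layout = config["layout"].get("first", config["layout"]["default"])
    tables = tabula.read_pdf(
        "statement.pdf",            # placeholder input file
        pages=1,
        area=layout["area"],        # [top, left, bottom, right] in pts
        columns=layout["columns"],  # right x coordinate of each column
        stream=True,
        guess=False,
        pandas_options={"dtype": str},
    )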
117 changes: 88 additions & 29 deletions pdf_statement_reader/parse.py
@@ -2,6 +2,8 @@
from pikepdf import Pdf
import pandas as pd
import numpy as np
import re
import logging


def get_raw_df(filename, num_pages, config):
@@ -22,18 +24,43 @@ def get_raw_df(filename, num_pages, config):
columns=columns,
stream=True,
guess=False,
pandas_options={"dtype": str},
pandas_options={"dtype": str, "header": None},
java_options=[
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog"
]
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog",
],
)
if df is not None and len(df) > 0:
dfs.extend(df)

if config["layout"]["pandas_options"]["header"] == "None":
for df in dfs:
df.columns = [config["columns"][col] for col in config["order"]]
statement = pd.concat(dfs, sort=False).reset_index(drop=True)
return statement


def clean_prestrip(df, config):
    """Drop rows whose key column is empty or matches the prestrip pattern."""
    key = config["columns"][config["cleaning"]["prestrip"][0]]
    value = config["cleaning"]["prestrip"][1]
    df.dropna(subset=[key], inplace=True)
    df = df[~df[key].str.match(value)]
    return df


def format_currency_number(s):
    """Strip currency symbols and separators, preserving the sign and decimal point."""
    DECIMAL_SEPARATOR = "."
    re_real = r"[^\d" + DECIMAL_SEPARATOR + "]+"
    re_negative = r"(?i)(^-|DR)|(-|DR$)"
    s = str(s)
    flag_negative = bool(re.search(re_negative, s))
    s = re.sub(re_real, "", s)
    if flag_negative:
        s = "-" + s
    return s


def format_negatives(s):
s = str(s)
if s.endswith("-"):
@@ -45,29 +72,9 @@ def format_negatives(s):
def clean_numeric(df, config):
numeric_cols = [config["columns"][col] for col in config["cleaning"]["numeric"]]


for col in numeric_cols:
df[col] = df[col].apply(format_negatives)
df[col] = df[col].str.replace(" ", "")
df[col] = pd.to_numeric(
df[col],
errors="coerce"
)


def clean_date(df, config):
date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
if "date_format" in config["cleaning"]:
date_format = config["cleaning"]["date_format"]
else:
date_format = None

for col in date_cols:
df[col] = pd.to_datetime(
df[col],
errors="coerce",
format=date_format
)
df[col] = df[col].apply(format_currency_number)
df[col] = pd.to_numeric(df[col], errors="coerce")


def clean_trans_detail(df, config):
@@ -84,6 +91,42 @@
df.loc[i - 1, trans_detail] = row[trans_type]


def clean_date(df, config):
date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
if "date_format" in config["cleaning"]:
date_format = config["cleaning"]["date_format"]
else:
date_format = None

cba = False # json setting needed
if cba:
year = df.iloc[0, 1][0:4]
date_format += " %Y"
for col in date_cols:
if cba:
df[col] += " " + year
df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format)


def clean_unwrap(df, config):
key = config["columns"][config["cleaning"]["unwrap"][0]]
val = config["columns"][config["cleaning"]["unwrap"][1]]
j = 0
for i, row in df.iterrows():
if pd.isnull(row[key]):
if pd.notna(df.loc[i, val]):
df.loc[j, val] += " " + df.loc[i, val]
else:
j = i


def clean_case(df, config):
cols = [config["columns"][col] for col in config["cleaning"]["case"]]
for col in cols:
df[col] = df[col].str.title()
return df


def clean_dropna(df, config):
drop_cols = [config["columns"][col] for col in config["cleaning"]["dropna"]]
df.dropna(subset=drop_cols, inplace=True)
@@ -95,23 +138,39 @@ def reorder_columns(df, config):


def parse_statement(filename, config):
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger()
pdf = Pdf.open(filename)
num_pages = len(pdf.pages)

statement = get_raw_df(filename, num_pages, config)
logging.debug(statement)

logging.debug("**" + "prestrip")
if "prestrip" in config["cleaning"]:
statement = clean_prestrip(statement, config)

logging.debug("**" + "numeric")
if "numeric" in config["cleaning"]:
clean_numeric(statement, config)

if "trans_detail" in config["cleaning"]:
clean_trans_detail(statement, config)

logging.debug("**" + "date")
if "date" in config["cleaning"]:
clean_date(statement, config)


logging.debug("**" + "unwrap")
if "unwrap" in config["cleaning"]:
clean_unwrap(statement, config)

logging.debug("**" + "case")
if "case" in config["cleaning"]:
statement = clean_case(statement, config)

logging.debug("**" + "dropna")
if "dropna" in config["cleaning"]:
clean_dropna(statement, config)

logging.debug("**" + "order")
if "order" in config:
statement = reorder_columns(statement, config)

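A side note on the hard-coded cba flag in clean_date: it could be driven from the config instead. A rough sketch of what the tail of clean_date might look like, where "statement_year" is a hypothetical cleaning option, not part of this PR:

    # Sketch only: a hypothetical "statement_year" option instead of the cba flag
    year = config["cleaning"].get("statement_year")
    if year and date_format and "%Y" not in date_format:
        date_format += " %Y"
        for col in date_cols:
            df[col] += " " + str(year)
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format)
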
68 changes: 68 additions & 0 deletions tests/test_parse_methods.py
@@ -1,4 +1,14 @@
from pdf_statement_reader.parse import format_negatives
from pdf_statement_reader.parse import format_currency_number
from pdf_statement_reader.parse import clean_prestrip
# from pdf_statement_reader.parse import clean_numeric
# from pdf_statement_reader.parse import clean_trans_detail
from pdf_statement_reader.parse import clean_unwrap
# from pdf_statement_reader.parse import clean_date
from pdf_statement_reader.parse import clean_case
# from pdf_statement_reader.parse import clean_dropna
# from pdf_statement_reader.parse import reorder_columns

import pandas as pd


def test_format_negatives():
assert format_negatives(123.45) == "123.45"
@@ -8,3 +18,61 @@ def test_format_negatives():
assert format_negatives("-123.45") == "-123.45"
assert format_negatives("0") == "0"
assert format_negatives("123.45-") == "-123.45"


def test_format_currency_number():
assert format_currency_number('123.45') == "123.45"
assert format_currency_number('$123.45') == "123.45"
assert format_currency_number('$123.45 CR') == "123.45"
assert format_currency_number('-1,234.56') == "-1234.56"
assert format_currency_number('1,234.56-') == "-1234.56"
assert format_currency_number('1,234.56 DR') == "-1234.56"
assert format_currency_number('-$1,234.56 dr') == "-1234.56"
assert format_currency_number('0') == "0"
assert format_currency_number('-1') == "-1"
assert format_currency_number('.12') == ".12"
assert format_currency_number('“1”') == "1"


def test_clean_prestrip():
    df = pd.DataFrame({
        "Field1": ["", "", ""],
        "Faction": ["test match", "Another test", None],
    })
    config = {
        "$schema": "",
        "columns": {"F1": "Field1", "Key": "Faction"},
        "cleaning": {"prestrip": ["Key", "test"]},
    }

    result = clean_prestrip(df, config)
    # Rows with an empty key and rows whose key matches the pattern are removed
    assert list(result["Faction"]) == ["Another test"]


def test_clean_unwrap():
    df = pd.DataFrame({
        "Date": ["01/01/2020", None, None, "26/05/2020"],
        "Faction": ["Test String", "Another test", "Short", "Last bit."],
    })
    config = {
        "$schema": "",
        "columns": {"Key": "Date", "F2": "Faction"},
        "cleaning": {"unwrap": ["Key", "F2"]},
    }

    clean_unwrap(df, config)  # modifies df in place
    # Detail from rows without a date is appended to the last dated row
    assert list(df["Faction"]) == [
        "Test String Another test Short",
        "Another test",
        "Short",
        "Last bit.",
    ]


def test_clean_case():
    df = pd.DataFrame({"Faction": ["test string", "ANOTHER TEst", "shORT", "last Bit."]})
    config = {
        "$schema": "",
        "columns": {"F2": "Faction"},
        "cleaning": {"case": ["F2"]},
    }

    result = clean_case(df, config)
    assert list(result["Faction"]) == ["Test String", "Another Test", "Short", "Last Bit."]
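
Assuming pytest is the test runner (the existing tests in this file already use the plain-assert pytest style), the suite can be run with:

    pytest tests/test_parse_methods.py -q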