Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced parsing functionality #50

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion pdf_statement_reader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def pdf2csv(input_filename, output_filename=None, config_spec=None):
output_filename = input_filename.split(".pdf")[0] + ".csv"

df = parse_statement(input_filename, config)
df.to_csv(output_filename, index=False, float_format="%.2f")
df.to_csv(output_filename, index=False, float_format="%.2f", date_format='%d/%m/%y')
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: the JSON config could support an optional second date format for the CSV output.

click.echo("Converted {} and saved as {}".format(input_filename, output_filename))


Expand Down
56 changes: 56 additions & 0 deletions pdf_statement_reader/config/cba/saving.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
"//": "Describe layout for pages to be scanned",
"layout": {
"//": "Default layout for all pages not otherwise defined",
"default": {
"//": "Page coordinates containing table in pts",
"//": "[top, left, bottom, right]",
"area": [143, 58, 760, 546],
"//": "Right x coordinate of each column in table",
"columns": [93, 344, 402, 460, 546]
},
"//": "Layout for first page",
"first": {
"area": [385, 58, 760, 546],
"columns": [93, 344, 402, 460, 546]
}
},

"//": "Statement column names exactly as they appear",
"columns": {
"trans_date": "Date",
"trans_detail": "Transaction",
"debit": "Debit",
"credit": "Credit",
"balance": "Balance"
},

"//": "csv column output order",
"order": [
"trans_date",
"trans_detail",
"debit",
"credit",
"balance"
],

"//": "Specify required cleaning operations",
"cleaning": {
"//": "Remove key column transactions using value",
"prestrip": ["trans_detail", "(?i)(^Balance.*Forward$)|(Closing Balance)"],
"//": "Convert these columns to numeric",
"numeric": ["debit", "credit", "balance"],
"//": "Convert these columns to date",
"date": ["trans_date"],
"//": "Use this date format to parse any date columns",
"date_format": "%d %b",
"trans_detail": "below",
"//": "Key column and value column to unwrap",
"unwrap": ["trans_date", "trans_detail"],
"//": "Change to Title Case",
"case": ["trans_detail"],
"//": "Only keep the rows where these columns are populated",
"dropna": ["trans_date"]
}
}
62 changes: 62 additions & 0 deletions pdf_statement_reader/config/citi/cheque.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
"//": "Describe layout for pages to be scanned",
"layout": {
"//": "Default layout for all pages not otherwise defined",
"default": {
"//": "Page coordinates containing table in pts",
"//": "[top, left, bottom, right]",
"area": [190, 47, 800, 558],
"//": "Right x coordinate of each column in table",
"columns": [105, 275, 340, 444, 558]
},
"//": "Layout for first page",
"first": {
"area": [530, 47, 800, 558],
"columns": [105, 275, 340, 444, 558]
},
"//": "Layout for no text-based header",
"pandas_options": {
"header": "None"
}
},

"//": "Statement column names exactly as they appear",
"columns": {
"trans_date": "Date",
"trans_detail": "Description",
"debit": "Debit",
"credit": "Credit",
"balance": "Balance"
},

"//": "csv column output order",
"order": [
"trans_date",
"trans_detail",
"debit",
"credit",
"balance"
],

"//": "Specify required cleaning operations",
"cleaning": {
"//": "Truncate from key column transactions using value",
"truncate": ["trans_detail", "CLOSING BALANCE"],
"//": "Remove key column transactions using value",
"prestrip": ["trans_detail", "(?i)(.*Balance$)|(Tota)"],
"//": "Convert these columns to numeric",
"numeric": ["debit", "credit", "balance"],
"//": "Convert these columns to date",
"date": ["trans_date"],
"//": "Use this date format to parse any date columns",
"date_format": "%d %b %Y",
"trans_detail": "below",
"//": "Key column and value column to unwrap",
"unwrap": ["trans_date", "trans_detail"],
"//": "Change to Title Case",
"case": ["trans_detail"],
"//": "Only keep the rows where these columns are populated",
"dropna": ["trans_date"]
}
}
150 changes: 118 additions & 32 deletions pdf_statement_reader/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,21 @@
from pikepdf import Pdf
import pandas as pd
import numpy as np
import re
import logging


def get_raw_df(filename, num_pages, config):
dfs = []
_pandas_options = {"dtype": str}
header = True
if config["layout"].get("pandas_options"):
_pandas_options.update(config["layout"].get("pandas_options"))
if (
config["layout"]["pandas_options"].get("header")
and config["layout"]["pandas_options"].get("header") == "None"
):
header = False

for i in range(num_pages):
if i == 0 and "first" in config["layout"]:
Expand All @@ -22,18 +33,51 @@ def get_raw_df(filename, num_pages, config):
columns=columns,
stream=True,
guess=False,
pandas_options={"dtype": str},
pandas_options=_pandas_options,
java_options=[
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog"
]
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog",
],
)
if df is not None and len(df) > 0:
dfs.extend(df)

if not header:
for df in dfs:
df.columns = [config["columns"][col] for col in config["order"]]
statement = pd.concat(dfs, sort=False).reset_index(drop=True)
return statement


def clean_truncate(df, config):
    """Drop the marker row and everything after it.

    The config's ``cleaning.truncate`` entry is a ``[column_key, value]``
    pair; all rows from the first row whose column equals ``value`` onward
    are discarded (e.g. a "CLOSING BALANCE" footer and any trailing text).

    :param df: raw statement DataFrame (assumed to have a RangeIndex, as
        produced by get_raw_df's reset_index)
    :param config: statement config with "columns" and "cleaning" sections
    :returns: possibly-shortened DataFrame
    """
    key_col = config["columns"][config["cleaning"]["truncate"][0]]
    marker = config["cleaning"]["truncate"][1]
    # Compute the match once; previously the same boolean mask was built twice.
    matches = df.index[df[key_col] == marker]
    if len(matches) > 0:
        # NOTE(review): the index label is used as an iloc position — only
        # correct while the frame keeps its default RangeIndex.
        df = df.iloc[: matches[0]]
    return df


def clean_prestrip(df, config):
    """Remove noise rows whose key column matches a configured regex.

    The config's ``cleaning.prestrip`` entry is a ``[column_key, pattern]``
    pair. Rows with no value in the key column are dropped first (in place),
    then any row whose key column matches ``pattern`` from the start
    (``str.match`` semantics) is filtered out.

    :param df: raw statement DataFrame; mutated in place by the dropna
    :param config: statement config with "columns" and "cleaning" sections
    :returns: filtered DataFrame
    """
    key_col = config["columns"][config["cleaning"]["prestrip"][0]]
    pattern = config["cleaning"]["prestrip"][1]
    # Rows without a key value cannot be matched against the pattern.
    df.dropna(subset=[key_col], inplace=True)
    # Keep only non-matching rows (was "~mask == True", a redundant
    # comparison, plus a commented-out .ix variant — both removed).
    df = df[~df[key_col].str.match(pattern)]
    return df


def format_currency_number(s):
    """Normalise a currency string so ``pd.to_numeric`` can parse it.

    Keeps only digits and the decimal separator (stripping spaces,
    thousands separators and currency symbols), and converts a leading
    "-"/"DR" or trailing "-"/"DR" debit marker into a single leading
    minus sign. The input is coerced to ``str`` first, so non-string
    cells (e.g. NaN) are handled without raising.

    :param s: raw cell value from a statement amount column
    :returns: cleaned numeric string, e.g. "1 234.56 DR" -> "-1234.56"
    """
    DECIMAL_SEPARATOR = "."
    # Raw strings: "\d" in a plain literal is an invalid escape
    # (DeprecationWarning on modern Pythons).
    strip_pattern = r"[^\d" + DECIMAL_SEPARATOR + r"]+"
    negative_pattern = r"(?i)(^-|DR)|(-|DR$)"
    s = str(s)
    is_negative = bool(re.search(negative_pattern, s))
    s = re.sub(strip_pattern, "", s)
    if is_negative:
        s = "-" + s
    return s


def format_negatives(s):
s = str(s)
if s.endswith("-"):
Expand All @@ -45,29 +89,10 @@ def format_negatives(s):
def clean_numeric(df, config):
numeric_cols = [config["columns"][col] for col in config["cleaning"]["numeric"]]


for col in numeric_cols:
df[col] = df[col].apply(format_negatives)
df[col] = df[col].str.replace(" ", "")
df[col] = pd.to_numeric(
df[col],
errors="coerce"
)


def clean_date(df, config):
date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
if "date_format" in config["cleaning"]:
date_format = config["cleaning"]["date_format"]
else:
date_format = None

for col in date_cols:
df[col] = pd.to_datetime(
df[col],
errors="coerce",
format=date_format
)
df[col] = df[col].apply(format_currency_number)
df[col] = pd.to_numeric(df[col], errors="coerce")
return df


def clean_trans_detail(df, config):
Expand All @@ -82,11 +107,52 @@ def clean_trans_detail(df, config):
continue
if np.isnan(row[balance]):
df.loc[i - 1, trans_detail] = row[trans_type]
return df


def clean_date(df, config):
    """Parse the configured date columns into datetimes.

    Uses ``cleaning.date_format`` when present (otherwise pandas infers
    the format). Unparseable values become NaT (``errors="coerce"``).

    The special format "%d %b" means the statement omits the year; in
    that case a year is appended to every date cell before parsing.
    NOTE(review): that year is taken from the first 4 characters of the
    second column of the first row — this assumes that cell starts with
    a 4-digit year; verify against the statement layouts that use it.

    :param df: statement DataFrame; date columns mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the DataFrame with parsed date columns
    """
    date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
    date_format = config["cleaning"].get("date_format")
    no_year = date_format == "%d %b"
    if no_year:
        year = df.iloc[0, 1][0:4]
        date_format += " %Y"
    for col in date_cols:
        if no_year:
            df[col] += " " + year
        df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format)
    return df


def clean_unwrap(df, config):
    """Fold wrapped continuation rows into their parent transaction.

    ``cleaning.unwrap`` names a ``[key_column, value_column]`` pair. A row
    with no value in the key column is treated as a continuation of the
    most recent row that did have one: its value-column text is appended
    (space-separated) to that anchor row's value column, in place.

    :param df: statement DataFrame; mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the same DataFrame
    """
    unwrap_cfg = config["cleaning"]["unwrap"]
    key_col = config["columns"][unwrap_cfg[0]]
    val_col = config["columns"][unwrap_cfg[1]]
    anchor = 0  # index label of the last row that carried a key value
    for idx, record in df.iterrows():
        if not pd.isnull(record[key_col]):
            anchor = idx
            continue
        continuation = df.loc[idx, val_col]
        if pd.notna(continuation):
            df.loc[anchor, val_col] += " " + continuation
    return df


def clean_case(df, config):
    """Convert the configured text columns to Title Case, in place.

    :param df: statement DataFrame; listed columns mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the same DataFrame
    """
    for name in (config["columns"][c] for c in config["cleaning"]["case"]):
        df[name] = df[name].str.title()
    return df


def clean_dropna(df, config):
    """Drop rows missing a value in any configured required column.

    :param df: statement DataFrame; mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the same DataFrame
    """
    required = [config["columns"][name] for name in config["cleaning"]["dropna"]]
    df.dropna(subset=required, inplace=True)
    return df


def reorder_columns(df, config):
Expand All @@ -95,24 +161,44 @@ def reorder_columns(df, config):


def parse_statement(filename, config):
    """Parse a PDF bank statement into a cleaned pandas DataFrame.

    Opens the PDF only to count its pages, extracts the raw transaction
    table with get_raw_df, applies each cleaning step present in
    config["cleaning"] in a fixed order, and finally reorders columns
    when the config specifies an "order".

    :param filename: path to the PDF statement file
    :param config: parsed statement-layout config (psr_config schema)
    :returns: cleaned DataFrame of transactions
    """
    # Library code should not call logging.basicConfig() (that is the
    # application's decision); use a module-named logger instead.
    logger = logging.getLogger(__name__)

    # Close the PDF handle as soon as the page count is known; the
    # previous version never closed it.
    with Pdf.open(filename) as pdf:
        num_pages = len(pdf.pages)

    statement = get_raw_df(filename, num_pages, config)
    logger.debug(statement)

    cleaning = config["cleaning"]

    if "truncate" in cleaning:
        logger.debug("**truncate")
        statement = clean_truncate(statement, config)

    if "prestrip" in cleaning:
        logger.debug("**prestrip")
        statement = clean_prestrip(statement, config)

    if "numeric" in cleaning:
        logger.debug("**numeric")
        statement = clean_numeric(statement, config)

    if "trans_detail" in cleaning:
        # clean_trans_detail returns the frame like every other step;
        # previously its return value was discarded.
        statement = clean_trans_detail(statement, config)

    if "date" in cleaning:
        logger.debug("**date")
        statement = clean_date(statement, config)

    if "unwrap" in cleaning:
        logger.debug("**unwrap")
        statement = clean_unwrap(statement, config)

    if "case" in cleaning:
        logger.debug("**case")
        statement = clean_case(statement, config)

    if "dropna" in cleaning:
        logger.debug("**dropna")
        statement = clean_dropna(statement, config)

    if "order" in config:
        logger.debug("**order")
        statement = reorder_columns(statement, config)

    return statement
Loading