Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced parsing functionality #50

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
61 changes: 39 additions & 22 deletions pdf_statement_reader/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,30 +24,35 @@ def get_raw_df(filename, num_pages, config):
columns=columns,
stream=True,
guess=False,
pandas_options={"dtype": str},
pandas_options={"dtype": str, "header": None},
flywire marked this conversation as resolved.
Show resolved Hide resolved
java_options=[
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog",
],
)
if df is not None and len(df) > 0:
dfs.extend(df)

if config["layout"]["pandas_options"]["header"] == "None":
for df in dfs:
df.columns = [config["columns"][col] for col in config["order"]]
statement = pd.concat(dfs, sort=False).reset_index(drop=True)
return statement


def clean_prestrip(df, config):
    """Drop rows whose prestrip column is missing or matches a pattern.

    ``config["cleaning"]["prestrip"]`` is a two-item sequence: the first
    item is a key into ``config["columns"]`` naming the column to test,
    the second is a regular expression.  Rows are removed when that
    column is null or when its value matches the expression at the start
    of the string (``Series.str.match`` semantics).

    Args:
        df: DataFrame to filter.  It is not modified.
        config: Parser configuration providing ``columns`` and
            ``cleaning.prestrip``.

    Returns:
        A new DataFrame holding the surviving rows.  The original index
        labels are kept (the index is not reset).
    """
    column_key, pattern = config["cleaning"]["prestrip"][:2]
    key = config["columns"][column_key]

    # Work on a filtered copy instead of dropping rows from the caller's
    # frame in place.
    kept = df.dropna(subset=[key])

    # na=False treats any value that cannot be matched as "no match", so
    # the mask is always boolean and can be inverted safely.
    matches = kept[key].str.match(pattern, na=False)

    # Return an independent frame: the cleaning steps that follow write
    # into the result in place.
    return kept[~matches].copy()


def format_currency_number(s):
decimal_separator = "."
re_real = "[^\d" + decimal_separator + "]+"
re_negative = "(^-|(?i)DR)|(-|(?i)DR$)"
DECIMAL_SEPARATOR = "."
re_real = "[^\d" + DECIMAL_SEPARATOR + "]+"
re_negative = "(?i)(^-|DR)|(-|DR$)"
s = str(s)
flag_negative = True if bool(re.search(re_negative, s)) else False
s = re.sub(re_real, "", s)
Expand Down Expand Up @@ -86,28 +91,33 @@ def clean_trans_detail(df, config):
df.loc[i - 1, trans_detail] = row[trans_type]


def clean_unwrap(df, config):
    """Merge wrapped continuation rows into the row they belong to.

    ``config["cleaning"]["unwrap"]`` holds two keys into
    ``config["columns"]``: the first names the column whose null value
    marks a continuation row, the second names the text column to join.
    The text of each continuation row is appended, separated by a single
    space, to the text of the most recent row whose key column is not
    null.  ``df`` is modified in place and nothing is returned.

    Continuation rows are left in the frame; only the anchor row's text
    changes.

    Args:
        df: DataFrame to update in place.
        config: Parser configuration providing ``columns`` and
            ``cleaning.unwrap``.
    """
    key = config["columns"][config["cleaning"]["unwrap"][0]]
    value = config["columns"][config["cleaning"]["unwrap"][1]]

    # Label of the last row that had a non-null key.  It starts as None
    # rather than 0 because label 0 may not exist (rows can be filtered
    # before this step) or may itself be a continuation row.
    anchor = None
    for label, row in df.iterrows():
        if not pd.isnull(row[key]):
            anchor = label
            continue

        fragment = df.loc[label, value]
        # Nothing to attach to yet, or nothing to attach.
        if anchor is None or pd.isnull(fragment):
            continue

        existing = df.loc[anchor, value]
        if pd.isnull(existing):
            # An empty anchor simply takes the fragment; concatenating
            # NaN with a string would raise TypeError.
            df.loc[anchor, value] = fragment
        else:
            df.loc[anchor, value] = existing + " " + fragment


def clean_date(df, config):
    """Convert the configured date columns to datetimes, in place.

    The columns listed in ``config["cleaning"]["date"]`` (keys into
    ``config["columns"]``) are parsed with ``pd.to_datetime`` using
    ``errors="coerce"``, so values that cannot be parsed become ``NaT``.
    ``config["cleaning"]["date_format"]`` is used as the format when
    present; otherwise no format is passed.

    Some statements print dates without a year.  When the optional
    setting ``config["cleaning"]["date_year_from_first_row"]`` is true, a
    year read from the frame is appended to every date before parsing
    and ``" %Y"`` is appended to the configured format.  The setting
    defaults to off, which matches the previously hard-coded flag.

    Args:
        df: DataFrame to update in place.
        config: Parser configuration providing ``columns`` and
            ``cleaning.date`` (plus the optional keys described above).
    """
    cleaning = config["cleaning"]
    date_cols = [config["columns"][col] for col in cleaning["date"]]
    date_format = cleaning.get("date_format")
    append_year = cleaning.get("date_year_from_first_row", False)

    year = None
    if append_year:
        # NOTE(review): assumes the second column of the first row starts
        # with a four-digit year - confirm against the statement layout.
        year = df.iloc[0, 1][0:4]
        # Only extend an explicit format; with no format configured,
        # parsing is left to pandas instead of failing on None + str.
        if date_format is not None:
            date_format = date_format + " %Y"

    for col in date_cols:
        values = df[col]
        if append_year:
            values = values + " " + year
        df[col] = pd.to_datetime(values, errors="coerce", format=date_format)


def clean_unwrap(df, config):
    """Fold wrapped text rows back into their parent row, in place.

    ``config["cleaning"]["unwrap"]`` names two entries of
    ``config["columns"]``: the column whose null value identifies a
    continuation row, and the text column to join.  Each non-null text
    fragment on a continuation row is appended, after a single space, to
    the text of the closest preceding row whose key column is not null.

    Continuation rows themselves are not removed, and the function
    returns nothing.

    Args:
        df: DataFrame to update in place.
        config: Parser configuration providing ``columns`` and
            ``cleaning.unwrap``.
    """
    key = config["columns"][config["cleaning"]["unwrap"][0]]
    val = config["columns"][config["cleaning"]["unwrap"][1]]

    # Track the parent row by index label.  None means "no parent seen
    # yet": assuming label 0 raises KeyError when that label is absent
    # and makes a leading continuation row append its text to itself.
    parent = None
    for i, row in df.iterrows():
        if pd.notna(row[key]):
            parent = i
            continue
        if parent is None:
            continue

        piece = df.loc[i, val]
        if pd.isna(piece):
            continue

        current = df.loc[parent, val]
        # A parent with no text takes the piece as-is; NaN + str would
        # raise TypeError.
        df.loc[parent, val] = piece if pd.isna(current) else current + " " + piece


def clean_case(df, config):
Expand All @@ -134,26 +144,33 @@ def parse_statement(filename, config):
num_pages = len(pdf.pages)

statement = get_raw_df(filename, num_pages, config)
logging.debug(statement["Transaction"])
logging.debug(statement)

logging.debug("**" + "prestrip")
if "prestrip" in config["cleaning"]:
statement = clean_prestrip(statement, config)

logging.debug("**" + "numeric")
if "numeric" in config["cleaning"]:
clean_numeric(statement, config)

if "unwrap" in config["cleaning"]:
clean_unwrap(statement, config)

logging.debug("**" + "date")
if "date" in config["cleaning"]:
clean_date(statement, config)

logging.debug("**" + "unwrap")
if "unwrap" in config["cleaning"]:
clean_unwrap(statement, config)

logging.debug("**" + "case")
if "case" in config["cleaning"]:
statement = clean_case(statement, config)

logging.debug("**" + "dropna")
if "dropna" in config["cleaning"]:
clean_dropna(statement, config)

logging.debug("**" + "order")
if "order" in config:
statement = reorder_columns(statement, config)

Expand Down