Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced parsing functionality #50

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion pdf_statement_reader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def pdf2csv(input_filename, output_filename=None, config_spec=None):
output_filename = input_filename.split(".pdf")[0] + ".csv"

df = parse_statement(input_filename, config)
df.to_csv(output_filename, index=False, float_format="%.2f")
df.to_csv(output_filename, index=False, float_format="%.2f", date_format='%d/%m/%y')
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: the JSON config could support an optional second date format for the CSV output.

click.echo("Converted {} and saved as {}".format(input_filename, output_filename))


Expand Down
56 changes: 56 additions & 0 deletions pdf_statement_reader/config/cba/saving.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
"//": "Describe layout for pages to be scanned",
"layout": {
"//": "Default layout for all pages not otherwise defined",
"default": {
"//": "Page coordinates containing table in pts",
"//": "[top, left, bottom, right]",
"area": [143, 58, 760, 546],
"//": "Right x coordinate of each column in table",
"columns": [93, 344, 402, 460, 546]
},
"//": "Layout for first page",
"first": {
"area": [385, 58, 760, 546],
"columns": [93, 344, 402, 460, 546]
}
},

"//": "Statement column names exactly as they appear",
"columns": {
"trans_date": "Date",
"trans_detail": "Transaction",
"debit": "Debit",
"credit": "Credit",
"balance": "Balance"
},

"//": "csv column output order",
"order": [
"trans_date",
"trans_detail",
"debit",
"credit",
"balance"
],

"//": "Specify required cleaning operations",
"cleaning": {
"//": "Remove key column transactions using value",
"prestrip": ["trans_detail", "(?i)(^Balance.*Forward$)|(Closing Balance)"],
"//": "Convert these columns to numeric",
"numeric": ["debit", "credit", "balance"],
"//": "Convert these columns to date",
"date": ["trans_date"],
"//": "Use this date format to parse any date columns",
"date_format": "%d %b",
"trans_detail": "below",
"//": "Key column and value column to unwrap",
"unwrap": ["trans_date", "trans_detail"],
"//": "Change to Title Case",
"case": ["trans_detail"],
"//": "Only keep the rows where these columns are populated",
"dropna": ["trans_date"]
}
}
62 changes: 62 additions & 0 deletions pdf_statement_reader/config/citi/cheque.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
"//": "Describe layout for pages to be scanned",
"layout": {
"//": "Default layout for all pages not otherwise defined",
"default": {
"//": "Page coordinates containing table in pts",
"//": "[top, left, bottom, right]",
"area": [190, 47, 800, 558],
"//": "Right x coordinate of each column in table",
"columns": [105, 275, 340, 444, 558]
},
"//": "Layout for first page",
"first": {
"area": [530, 47, 800, 558],
"columns": [105, 275, 340, 444, 558]
},
"//": "Layout for no text-based header",
"pandas_options": {
"header": "None"
}
},

"//": "Statement column names exactly as they appear",
"columns": {
"trans_date": "Date",
"trans_detail": "Description",
"debit": "Debit",
"credit": "Credit",
"balance": "Balance"
},

"//": "csv column output order",
"order": [
"trans_date",
"trans_detail",
"debit",
"credit",
"balance"
],

"//": "Specify required cleaning operations",
"cleaning": {
"//": "Truncate from key column transactions using value",
"truncate": ["trans_detail", "CLOSING BALANCE"],
"//": "Remove key column transactions using value",
"prestrip": ["trans_detail", "(?i)(.*Balance$)|(Tota)"],
"//": "Convert these columns to numeric",
"numeric": ["debit", "credit", "balance"],
"//": "Convert these columns to date",
"date": ["trans_date"],
"//": "Use this date format to parse any date columns",
"date_format": "%d %b %Y",
"trans_detail": "below",
"//": "Key column and value column to unwrap",
"unwrap": ["trans_date", "trans_detail"],
"//": "Change to Title Case",
"case": ["trans_detail"],
"//": "Only keep the rows where these columns are populated",
"dropna": ["trans_date"]
}
}
150 changes: 118 additions & 32 deletions pdf_statement_reader/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,21 @@
from pikepdf import Pdf
import pandas as pd
import numpy as np
import re
import logging


def get_raw_df(filename, num_pages, config):
dfs = []
_pandas_options = {"dtype": str}
header = True
if config["layout"].get("pandas_options"):
_pandas_options.update(config["layout"].get("pandas_options"))
if (
config["layout"]["pandas_options"].get("header")
and config["layout"]["pandas_options"].get("header") == "None"
):
header = False

for i in range(num_pages):
if i == 0 and "first" in config["layout"]:
Expand All @@ -22,18 +33,51 @@ def get_raw_df(filename, num_pages, config):
columns=columns,
stream=True,
guess=False,
pandas_options={"dtype": str},
pandas_options=_pandas_options,
java_options=[
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog"
]
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog",
],
)
if df is not None and len(df) > 0:
dfs.extend(df)

if not header:
for df in dfs:
df.columns = [config["columns"][col] for col in config["order"]]
statement = pd.concat(dfs, sort=False).reset_index(drop=True)
return statement


def clean_truncate(df, config):
    """Drop the marker row and everything after it.

    The config's ``cleaning.truncate`` entry is a ``[column_key, value]``
    pair; all rows from the first row whose column equals ``value`` onward
    are discarded (e.g. a "CLOSING BALANCE" footer and any trailing text).

    :param df: raw statement DataFrame (assumed to have a RangeIndex, as
        produced by get_raw_df's reset_index)
    :param config: statement config with "columns" and "cleaning" sections
    :returns: possibly-shortened DataFrame
    """
    key_col = config["columns"][config["cleaning"]["truncate"][0]]
    marker = config["cleaning"]["truncate"][1]
    # Compute the match once; previously the same boolean mask was built twice.
    matches = df.index[df[key_col] == marker]
    if len(matches) > 0:
        # NOTE(review): the index label is used as an iloc position — only
        # correct while the frame keeps its default RangeIndex.
        df = df.iloc[: matches[0]]
    return df


def clean_prestrip(df, config):
    """Remove noise rows whose key column matches a configured regex.

    The config's ``cleaning.prestrip`` entry is a ``[column_key, pattern]``
    pair. Rows with no value in the key column are dropped first (in place),
    then any row whose key column matches ``pattern`` from the start
    (``str.match`` semantics) is filtered out.

    :param df: raw statement DataFrame; mutated in place by the dropna
    :param config: statement config with "columns" and "cleaning" sections
    :returns: filtered DataFrame
    """
    key_col = config["columns"][config["cleaning"]["prestrip"][0]]
    pattern = config["cleaning"]["prestrip"][1]
    # Rows without a key value cannot be matched against the pattern.
    df.dropna(subset=[key_col], inplace=True)
    # Keep only non-matching rows (was "~mask == True", a redundant
    # comparison, plus a commented-out .ix variant — both removed).
    df = df[~df[key_col].str.match(pattern)]
    return df


def format_currency_number(s):
    """Normalise a currency string so ``pd.to_numeric`` can parse it.

    Keeps only digits and the decimal separator (stripping spaces,
    thousands separators and currency symbols), and converts a leading
    "-"/"DR" or trailing "-"/"DR" debit marker into a single leading
    minus sign. The input is coerced to ``str`` first, so non-string
    cells (e.g. NaN) are handled without raising.

    :param s: raw cell value from a statement amount column
    :returns: cleaned numeric string, e.g. "1 234.56 DR" -> "-1234.56"
    """
    DECIMAL_SEPARATOR = "."
    # Raw strings: "\d" in a plain literal is an invalid escape
    # (DeprecationWarning on modern Pythons).
    strip_pattern = r"[^\d" + DECIMAL_SEPARATOR + r"]+"
    negative_pattern = r"(?i)(^-|DR)|(-|DR$)"
    s = str(s)
    is_negative = bool(re.search(negative_pattern, s))
    s = re.sub(strip_pattern, "", s)
    if is_negative:
        s = "-" + s
    return s


def format_negatives(s):
s = str(s)
if s.endswith("-"):
Expand All @@ -45,29 +89,10 @@ def format_negatives(s):
def clean_numeric(df, config):
numeric_cols = [config["columns"][col] for col in config["cleaning"]["numeric"]]


for col in numeric_cols:
df[col] = df[col].apply(format_negatives)
df[col] = df[col].str.replace(" ", "")
df[col] = pd.to_numeric(
df[col],
errors="coerce"
)


def clean_date(df, config):
date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
if "date_format" in config["cleaning"]:
date_format = config["cleaning"]["date_format"]
else:
date_format = None

for col in date_cols:
df[col] = pd.to_datetime(
df[col],
errors="coerce",
format=date_format
)
df[col] = df[col].apply(format_currency_number)
df[col] = pd.to_numeric(df[col], errors="coerce")
return df


def clean_trans_detail(df, config):
Expand All @@ -82,11 +107,52 @@ def clean_trans_detail(df, config):
continue
if np.isnan(row[balance]):
df.loc[i - 1, trans_detail] = row[trans_type]
return df


def clean_date(df, config):
    """Parse the configured date columns into datetimes.

    Uses ``cleaning.date_format`` when present (otherwise pandas infers
    the format). Unparseable values become NaT (``errors="coerce"``).

    The special format "%d %b" means the statement omits the year; in
    that case a year is appended to every date cell before parsing.
    NOTE(review): that year is taken from the first 4 characters of the
    second column of the first row — this assumes that cell starts with
    a 4-digit year; verify against the statement layouts that use it.

    :param df: statement DataFrame; date columns mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the DataFrame with parsed date columns
    """
    date_cols = [config["columns"][col] for col in config["cleaning"]["date"]]
    date_format = config["cleaning"].get("date_format")
    no_year = date_format == "%d %b"
    if no_year:
        year = df.iloc[0, 1][0:4]
        date_format += " %Y"
    for col in date_cols:
        if no_year:
            df[col] += " " + year
        df[col] = pd.to_datetime(df[col], errors="coerce", format=date_format)
    return df


def clean_unwrap(df, config):
    """Fold wrapped continuation rows into their parent transaction.

    ``cleaning.unwrap`` names a ``[key_column, value_column]`` pair. A row
    with no value in the key column is treated as a continuation of the
    most recent row that did have one: its value-column text is appended
    (space-separated) to that anchor row's value column, in place.

    :param df: statement DataFrame; mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the same DataFrame
    """
    unwrap_cfg = config["cleaning"]["unwrap"]
    key_col = config["columns"][unwrap_cfg[0]]
    val_col = config["columns"][unwrap_cfg[1]]
    anchor = 0  # index label of the last row that carried a key value
    for idx, record in df.iterrows():
        if not pd.isnull(record[key_col]):
            anchor = idx
            continue
        continuation = df.loc[idx, val_col]
        if pd.notna(continuation):
            df.loc[anchor, val_col] += " " + continuation
    return df


def clean_case(df, config):
    """Convert the configured text columns to Title Case, in place.

    :param df: statement DataFrame; listed columns mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the same DataFrame
    """
    for name in (config["columns"][c] for c in config["cleaning"]["case"]):
        df[name] = df[name].str.title()
    return df


def clean_dropna(df, config):
    """Drop rows missing a value in any configured required column.

    :param df: statement DataFrame; mutated in place
    :param config: statement config with "columns" and "cleaning" sections
    :returns: the same DataFrame
    """
    required = [config["columns"][name] for name in config["cleaning"]["dropna"]]
    df.dropna(subset=required, inplace=True)
    return df


def reorder_columns(df, config):
Expand All @@ -95,24 +161,44 @@ def reorder_columns(df, config):


def parse_statement(filename, config):
    """Parse a PDF bank statement into a cleaned pandas DataFrame.

    Opens the PDF only to count its pages, extracts the raw transaction
    table with get_raw_df, applies each cleaning step present in
    config["cleaning"] in a fixed order, and finally reorders columns
    when the config specifies an "order".

    :param filename: path to the PDF statement file
    :param config: parsed statement-layout config (psr_config schema)
    :returns: cleaned DataFrame of transactions
    """
    # Library code should not call logging.basicConfig() (that is the
    # application's decision); use a module-named logger instead.
    logger = logging.getLogger(__name__)

    # Close the PDF handle as soon as the page count is known; the
    # previous version never closed it.
    with Pdf.open(filename) as pdf:
        num_pages = len(pdf.pages)

    statement = get_raw_df(filename, num_pages, config)
    logger.debug(statement)

    cleaning = config["cleaning"]

    if "truncate" in cleaning:
        logger.debug("**truncate")
        statement = clean_truncate(statement, config)

    if "prestrip" in cleaning:
        logger.debug("**prestrip")
        statement = clean_prestrip(statement, config)

    if "numeric" in cleaning:
        logger.debug("**numeric")
        statement = clean_numeric(statement, config)

    if "trans_detail" in cleaning:
        # clean_trans_detail returns the frame like every other step;
        # previously its return value was discarded.
        statement = clean_trans_detail(statement, config)

    if "date" in cleaning:
        logger.debug("**date")
        statement = clean_date(statement, config)

    if "unwrap" in cleaning:
        logger.debug("**unwrap")
        statement = clean_unwrap(statement, config)

    if "case" in cleaning:
        logger.debug("**case")
        statement = clean_case(statement, config)

    if "dropna" in cleaning:
        logger.debug("**dropna")
        statement = clean_dropna(statement, config)

    if "order" in config:
        logger.debug("**order")
        statement = reorder_columns(statement, config)

    return statement
Loading