refactor: modularize code

MichaelVerdegaal · Sep 15, 2024 · 6ed6de6 · 6ed6de6
1 parent 94b7953
commit 6ed6de6
Show file tree

Hide file tree

Showing 7 changed files with 479 additions and 103 deletions.
diff --git a/categories.txt b/categories.txt
@@ -0,0 +1,10 @@
+House payments
+Utilities
+Food
+Transport
+Health
+Subscriptions
+Insurance
+Entertainment
+Other
+Unknown
diff --git a/config.py b/config.py
@@ -1,6 +1,10 @@
 """
 Generic config variables
 """
+
 from pathlib import Path
 
 PROJECT_FOLDER = Path(__file__).parent.resolve()
+
+OUTPUT_FOLDER = PROJECT_FOLDER / "output"
+OUTPUT_FOLDER.mkdir(exist_ok=True)
diff --git a/main.py b/main.py
@@ -1,62 +1,35 @@
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-# Load datai
-df = pd.read_csv("data/NL38INGB0001546874_01-01-2024_18-08-2024.csv",
-                 sep=";")
-df = df[["Date", "Name / Description", "Amount (EUR)", "Transaction type", "Notifications"]]
-
-# Rename
-df = df.rename(columns={"Date": "date",
-                        "Name / Description": "description",
-                        "Amount (EUR)": "amount",
-                        "Transaction type": "category",
-                        "Notifications": "extra"})
-
-# Set date format
-df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
-# To numeric
-df["amount"] = df["amount"].str.replace(",", ".")
-df["amount"] = pd.to_numeric(df["amount"])
-
-# Get unique transaction categories
-categories = df["category"].unique().tolist()
-print(categories)
-
-# Split
-plt.pie(df.groupby("category")["amount"].sum(), labels=categories, autopct='%.0f%%')
-plt.show()
-
-# Categorize
-"""
-Online Banking = Moving between my accounts | Manual payments to other accounts
-Batch payment = Retour. Only 1 instance
-Cash machine = Cash withdrawal (ATM)
-Deposit = Cash deposit (ATM)
-Transfer = Rounding feature from savings | Retour
-SEPA direct debit = Automatic debit payments
-Various = ING account payments | Credit card repayment
-Payment terminal = Payments at card machine
-iDEAL = Payments via iDEAL 
-
-____________________________________________________________________
-
-Drop:
-
-- Batch payment
-- Deposit
-- Transfer
-
-Split:
-- Online banking --> Remove if extra contains 'From Oranje spaarrekening'
-
-Keep:
-- Online banking
-- SEPA direct debit
-- Various
-- Payment terminal
-- iDEAL
-- Cash machine
-"""
+import json
+
+from config import PROJECT_FOLDER, OUTPUT_FOLDER
+from main.load import ing_loader
+from transformers import pipeline
+
+# Load and preprocess data
+df = ing_loader(
+    PROJECT_FOLDER / "data" / "NL38INGB0001546874_01-01-2024_18-08-2024.csv"
+)
+
+# Set up zero-shot classifier
+zeroshot_classifier = pipeline(
+    "zero-shot-classification",
+    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
+    device=0,
+)
+
+# Read budget categories from categories.txt
+with open(PROJECT_FOLDER / "categories.txt", "r") as f:
+    budget_categories = [line.strip() for line in f.readlines()]
+
+# Set up json file to store transaction classifications
+json_file_path = OUTPUT_FOLDER / "transaction_classifications.json"
+
+# Initialize an empty dictionary to store classifications
+transaction_classifications = {}
+
+# Load existing classifications if the file exists
+if json_file_path.exists():
+    with json_file_path.open("r") as json_file:
+        transaction_classifications = json.load(json_file)
+
+# Iterate over each row
 ...
diff --git a/main/load.py b/main/load.py
@@ -0,0 +1,73 @@
+"""
+Contains loaders for different data sources (file only)
+
+Output will result in unified dataframe:
+- date
+- account_name
+- account_number
+- amount
+- description
+- category (if available)
+"""
+
+import pandas as pd
+
+
+def ing_loader(file_path):
+    """Load an ING bank transaction export into the unified dataframe layout.
+
+    The export is read as a semicolon-separated CSV, reduced to the columns
+    this project uses, renamed, type-converted and filtered.
+
+    Meaning of the values found in the ING "Transaction type" column:
+
+    Online Banking = Moving between accounts | Manual payments to other accounts
+    Batch payment = Retour. Only 1 instance
+    Cash machine = Cash withdrawal (ATM)
+    Deposit = Cash deposit (ATM)
+    Transfer = Rounding feature from savings | Retour
+    SEPA direct debit = Automatic debit payments
+    Various = ING account payments | Credit card repayment
+    Payment terminal = Payments at card machine
+    iDEAL = Payments via iDEAL
+    ____________________________________________________________________
+    Drop:
+    - Batch payment
+    - Deposit
+    - Transfer
+
+    Split:
+    - Online Banking --> Remove if description contains
+      'Oranje spaarrekening' (case-insensitive)
+
+    Args:
+        file_path: Path (str or path-like) of the ING CSV export.
+
+    Returns:
+        pandas.DataFrame with the columns date (datetime), account_name,
+        account_number, amount (numeric), category and description.
+
+    Raises:
+        KeyError: If one of the expected ING columns is missing from the file.
+    """
+    # ING column name -> unified column name. The dict order also fixes the
+    # column order of the returned dataframe.
+    column_names = {
+        "Date": "date",
+        "Name / Description": "account_name",
+        "Account": "account_number",
+        "Amount (EUR)": "amount",
+        "Transaction type": "category",
+        "Notifications": "description",
+    }
+    dropped_categories = ["Batch payment", "Deposit", "Transfer"]
+
+    # Load the export and keep only the columns we need, under their new names
+    df = pd.read_csv(file_path, sep=";")
+    df = df[list(column_names)].rename(columns=column_names)
+
+    # Dates are exported as YYYYMMDD
+    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
+    # Amounts use a decimal comma; astype(str) keeps this working even when
+    # pandas already parsed the column as numbers (no comma present).
+    df["amount"] = pd.to_numeric(
+        df["amount"].astype(str).str.replace(",", ".", regex=False)
+    )
+
+    # Drop transaction types that are not real spending
+    df = df[~df["category"].isin(dropped_categories)]
+
+    # Drop movements to/from the savings account. The "Notifications" column
+    # was renamed to "description" above, so that is the column to search
+    # (there is no "extra" column in this dataframe).
+    savings_transfer = (df["category"] == "Online Banking") & df[
+        "description"
+    ].astype(str).str.contains(
+        "Oranje spaarrekening", case=False, na=False, regex=False
+    )
+    df = df[~savings_transfer]
+
+    return df
diff --git a/main/main.ipynb b/main/main.ipynb