Skip to content

Commit

Permalink
refactor: modularize code
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelVerdegaal committed Sep 15, 2024
1 parent 94b7953 commit 6ed6de6
Show file tree
Hide file tree
Showing 7 changed files with 479 additions and 103 deletions.
10 changes: 10 additions & 0 deletions categories.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
House payments
Utilities
Food
Transport
Health
Subscriptions
Insurance
Entertainment
Other
Unknown
4 changes: 4 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""
Generic config variables
"""

from pathlib import Path

# Absolute path of the directory that holds this config module.
PROJECT_FOLDER = Path(__file__).parent.resolve()

# Folder for generated output; created at import time when it is missing.
OUTPUT_FOLDER = PROJECT_FOLDER.joinpath("output")
OUTPUT_FOLDER.mkdir(exist_ok=True)
95 changes: 34 additions & 61 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,35 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv(
    "data/NL38INGB0001546874_01-01-2024_18-08-2024.csv",
    sep=";",
)
df = df[["Date", "Name / Description", "Amount (EUR)", "Transaction type", "Notifications"]]

# Rename
df = df.rename(
    columns={
        "Date": "date",
        "Name / Description": "description",
        "Amount (EUR)": "amount",
        "Transaction type": "category",
        "Notifications": "extra",
    }
)

# Set date format
df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
# To numeric: swap the decimal comma for a dot before parsing
df["amount"] = df["amount"].str.replace(",", ".")
df["amount"] = pd.to_numeric(df["amount"])

# Get unique transaction categories
categories = df["category"].unique().tolist()
print(categories)

# Split
# groupby() sorts its keys while unique() keeps order of appearance, so the
# labels must come from the grouped index to line up with the wedge values.
category_totals = df.groupby("category")["amount"].sum()
plt.pie(category_totals, labels=category_totals.index, autopct='%.0f%%')
plt.show()

# Categorize
"""
Online Banking = Moving between my accounts | Manual payments to other accounts
Batch payment = Retour. Only 1 instance
Cash machine = Cash withdrawal (ATM)
Deposit = Cash deposit (ATM)
Transfer = Rounding feature from savings | Retour
SEPA direct debit = Automatic debit payments
Various = ING account payments | Credit card repayment
Payment terminal = Payments at card machine
iDEAL = Payments via iDEAL
____________________________________________________________________
Drop:
- Batch payment
- Deposit
- Transfer
Split:
- Online banking --> Remove if extra contains 'From Oranje spaarrekening'
Keep:
- Online banking
- SEPA direct debit
- Various
- Payment terminal
- iDEAL
- Cash machine
"""
import json

from config import PROJECT_FOLDER, OUTPUT_FOLDER
from main.load import ing_loader
from transformers import pipeline

# Load and preprocess data
df = ing_loader(
    PROJECT_FOLDER / "data" / "NL38INGB0001546874_01-01-2024_18-08-2024.csv"
)

# Set up zero-shot classifier
# NOTE(review): device=0 presumably pins the model to the first GPU — confirm
# that a GPU is available wherever this script is run.
zeroshot_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    device=0,
)

# Read budget categories from categories.txt (one category per line).
# Blank lines are skipped so an empty string never becomes a category.
with open(PROJECT_FOLDER / "categories.txt", "r", encoding="utf-8") as f:
    budget_categories = [line.strip() for line in f if line.strip()]

# Set up json file to store transaction classifications
json_file_path = OUTPUT_FOLDER / "transaction_classifications.json"

# Initialize an empty dictionary to store classifications
transaction_classifications = {}

# Load existing classifications if the file exists
if json_file_path.exists():
    with json_file_path.open("r") as json_file:
        transaction_classifications = json.load(json_file)
# Iterate over each row
...
73 changes: 73 additions & 0 deletions main/load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""
Contains loaders for different data sources (file only)
Output will result in unified dataframe:
- date
- account_name
- account_number
- amount
- description
- category (if available)
"""

import pandas as pd


def ing_loader(file_path):
    """Load an ING bank transaction export into the unified dataframe layout.

    The semicolon-separated export is read, reduced to the columns of
    interest, renamed, type-converted and filtered.

    Meaning of the ING "Transaction type" values (kept as ``category``):
        Online Banking    = Moving between accounts | Manual payments to other accounts
        Batch payment     = Retour. Only 1 instance
        Cash machine      = Cash withdrawal (ATM)
        Deposit           = Cash deposit (ATM)
        Transfer          = Rounding feature from savings | Retour
        SEPA direct debit = Automatic debit payments
        Various           = ING account payments | Credit card repayment
        Payment terminal  = Payments at card machine
        iDEAL             = Payments via iDEAL

    Filtering:
        Drop:
            - Batch payment
            - Deposit
            - Transfer
        Split:
            - Online Banking --> removed if the description (the export's
              "Notifications" column) contains 'Oranje spaarrekening',
              matched case-insensitively.

    Args:
        file_path: Location of the ING CSV export, in any form accepted by
            ``pandas.read_csv``.

    Returns:
        A dataframe with the columns ``date``, ``account_name``,
        ``account_number``, ``amount``, ``category`` and ``description``.

    Raises:
        KeyError: If the export lacks one of the expected columns.
    """
    # Load and preprocess data
    df = pd.read_csv(file_path, sep=";")
    df = df[
        [
            "Date",
            "Name / Description",
            "Account",
            "Amount (EUR)",
            "Transaction type",
            "Notifications",
        ]
    ]

    # Rename columns for consistency and clarity
    df = df.rename(
        columns={
            "Date": "date",
            "Name / Description": "account_name",
            "Account": "account_number",
            "Amount (EUR)": "amount",
            "Transaction type": "category",
            "Notifications": "description",
        }
    )

    # Convert date and amount columns; amounts use a decimal comma
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
    df["amount"] = pd.to_numeric(df["amount"].str.replace(",", "."))

    # Filter transactions
    df = df[~df["category"].isin(["Batch payment", "Deposit", "Transfer"])]
    # "Notifications" was renamed to "description" above; there is no "extra"
    # column in this frame, so the savings-account check must read "description".
    mask = (df["category"] == "Online Banking") & (
        df["description"].str.contains("Oranje spaarrekening", case=False, na=False)
    )
    df = df[~mask]

    return df
35 changes: 0 additions & 35 deletions main/main.ipynb

This file was deleted.

Loading

0 comments on commit 6ed6de6

Please sign in to comment.