Test branch #32

Open · wants to merge 17 commits into `master`
16 changes: 12 additions & 4 deletions .github/workflows/cicd.yml
@@ -1,10 +1,13 @@
name: ci-cd

on: pull_request, push
# run the workflow on pull requests and pushes
on: [pull_request, push]

jobs:
# first job to test the application using pytest
build:
runs-on: ubuntu-latest
runs-on: ubuntu-latest # choose the OS for running the action
# define the individual sequential steps to be run
steps:
- name: Checkout the repository
uses: actions/checkout@v2
@@ -18,11 +21,16 @@ jobs:
- name: Run pytest
run: |
pytest


# second job to zip the codebase and upload it as an artifact when build succeeds
upload_zip:
runs-on: ubuntu-latest
runs-on: ubuntu-latest # choose the OS for running the action
needs: build

# only run this job for pushes
if: ${{ github.event_name == 'push' }}

# define the individual sequential steps to be run
steps:
- name: Checkout the repository
uses: actions/checkout@v2
20 changes: 20 additions & 0 deletions README.md
@@ -0,0 +1,20 @@
# ML-Ops Demo/Assignment

This repository contains code that demonstrates ML-Ops using a `FastAPI` application which predicts the flower class for the IRIS dataset (https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html).

## Running Instructions
- Create a fork of the repo using the `fork` button.
- Clone your fork using `git clone https://www.github.com/techenik/mlops-iris.git`
- Install dependencies using `pip3 install -r requirements.txt`
- Run the application using `python3 main.py` (a sample request is sketched after this list)
- Run tests using `pytest`
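
Once the application is running (it listens on port 8888, per `main.py`), you can send a prediction request. The snippet below is a minimal sketch using only the Python standard library; the measurement values are illustrative, not taken from the dataset.

```python
# Minimal sketch: POST a prediction request to the running app on localhost:8888.
import json
from urllib import request

payload = {
    "sepal_length": 5.1,  # illustrative values matching the QueryIn schema
    "sepal_width": 3.5,
    "petal_length": 1.4,
    "petal_width": 0.2,
}

req = request.Request(
    "http://localhost:8888/predict_flower",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with request.urlopen(req) as resp:
    # expected shape per QueryOut: {"flower_class": "...", "timestamp": "..."}
    print(json.loads(resp.read()))
```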

## CI/CD
- `build` (test) runs for all pull requests
- `build` (test) and `upload_zip` run for all pushes

## Assignment Tasks
1. Change this README to add your name here: Nikhil Gopala. Add and commit changes to a new branch and create a pull request ONLY TO YOUR OWN FORK to see the CI/CD build happening. If the build succeeds, merge the pull request with master and see the CI/CD `upload_zip` take place.
2. Add 2 more unit tests of your choice to `test_app.py` and make sure they are passing.
3. Add one more classifier at startup and use only the one with the better accuracy.
4. Add the attribute `timestamp` to the response and return the current time with it.
49 changes: 38 additions & 11 deletions main.py
@@ -1,36 +1,63 @@
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from ml_utils import load_model, predict
from ml_utils import load_model, predict, retrain
from typing import List
from datetime import datetime

app = FastAPI(
title="Iris Predictor",
docs_url="/"
)
# defining the main app
app = FastAPI(title="Iris Predictor", docs_url="/")

# calling the load_model during startup.
# this will train the model and keep it loaded for prediction.
app.add_event_handler("startup", load_model)

# class which is expected in the payload
class QueryIn(BaseModel):
sepal_length: float
sepal_width: float
petal_length: float
petal_width: float


# class which is returned in the response
class QueryOut(BaseModel):
flower_class: str
timestamp: datetime

# class which is expected in the payload while re-training
class FeedbackIn(BaseModel):
sepal_length: float
sepal_width: float
petal_length: float
petal_width: float
flower_class: str

# Route definitions
@app.get("/ping")
# Healthcheck route to ensure that the API is up and running
def ping():
return {"ping": "pong"}
return {"ping": "pong", "timestamp": datetime.now().strftime("%b %d %Y %H:%M:%S")}


@app.post("/predict_flower", response_model=QueryOut, status_code=200)
def predict_flower(
query_data: QueryIn
):
output = {'flower_class': predict(query_data)}
# Route to do the prediction using the ML model defined.
# Payload: QueryIn containing the parameters
# Response: QueryOut containing the flower_class predicted (200)
def predict_flower(query_data: QueryIn):
output = {"flower_class": predict(query_data), "timestamp": datetime.now()}
return output

@app.post("/feedback_loop", status_code=200)
# Route to further train the model based on user input in the form of a feedback loop
# Payload: FeedbackIn containing the parameters and correct flower class
# Response: Dict with detail confirming success (200)
def feedback_loop(data: List[FeedbackIn]):
retrain(data)
return {"detail": "Feedback loop successful", "timestamp": datetime.now().strftime("%b %d %Y %H:%M:%S")}


# Main function to start the app when main.py is called
if __name__ == "__main__":
uvicorn.run("main:app", host='0.0.0.0', port=8888, reload=True)
# Uvicorn is used to run the server and listen for incoming API requests on 0.0.0.0:8888
uvicorn.run("main:app", host="0.0.0.0", port=8888, reload=True)
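
As a rough illustration of how these routes behave, here is a minimal sketch that drives them through FastAPI's `TestClient`, so no server needs to be running; the sample measurements and the feedback record are illustrative only.

```python
# Minimal sketch: exercise /predict_flower and /feedback_loop via TestClient.
from fastapi.testclient import TestClient
from main import app

with TestClient(app) as client:  # the context manager fires the startup handler (load_model)
    # prediction request matching the QueryIn schema
    pred = client.post(
        "/predict_flower",
        json={"sepal_length": 6.3, "sepal_width": 3.3,
              "petal_length": 6.0, "petal_width": 2.5},
    )
    print(pred.json())  # e.g. {"flower_class": "...", "timestamp": "..."}

    # feedback request: a list of FeedbackIn records with the correct class attached
    fb = client.post(
        "/feedback_loop",
        json=[{"sepal_length": 6.3, "sepal_width": 3.3,
               "petal_length": 6.0, "petal_width": 2.5,
               "flower_class": "Iris Virginica"}],
    )
    print(fb.json())  # {"detail": "Feedback loop successful", "timestamp": "..."}
```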
63 changes: 47 additions & 16 deletions ml_utils.py
@@ -1,31 +1,62 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clf = GaussianNB()
# define a Gaussian NB classifier
gnb = GaussianNB()

classes = {
0: "Iris Setosa",
1: "Iris Versicolour",
2: "Iris Virginica"
}
# task 3
# define a Random Forest Classifier
rfc = RandomForestClassifier(max_depth=4, random_state=0)

best_clf = gnb

# define the class encodings and reverse encodings
classes = {0: "Iris Setosa", 1: "Iris Versicolour", 2: "Iris Virginica"}
r_classes = {y: x for x, y in classes.items()}

# function to train and load the model during startup
def load_model():
X, y = datasets.load_iris(return_X_y=True)
# load the dataset from the official sklearn datasets
X, y = datasets.load_iris(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
clf.fit(X_train, y_train)
# do the test-train split and train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
gnb.fit(X_train, y_train)

acc = accuracy_score(y_test, clf.predict(X_test))
print(f"Model trained with accuracy: {round(acc, 3)}")
# task 3 - training second classifier
rfc.fit(X_train, y_train)

def predict(query_data):
x = list(query_data.dict().values())
prediction = clf.predict([x])[0]
print(f"Model prediction: {classes[prediction]}")
return classes[prediction]
# calculate and print the accuracy score of Gaussian NB
acc_gnb = accuracy_score(y_test, gnb.predict(X_test))
print(f"Gaussian NB Model trained with accuracy: {round(acc_gnb, 3)}")

# task 3 - calculate and print the accuracy score of the Random Forest Classifier
acc_rfc = accuracy_score(y_test, rfc.predict(X_test))
print(f"Random Forest Classifier Model trained with accuracy: {round(acc_rfc, 3)}")

# declare best_clf as global so the selection persists for predict() and retrain()
global best_clf
if acc_rfc > acc_gnb:
best_clf = rfc
print("Random Forest Classifier has better accuracy than Gaussian NB")
else:
best_clf = gnb
print("Gaussian NB has better accuracy than Random Forest Classifier")


# function to predict the flower using the model
def predict(query_data):
x = list(query_data.dict().values())
prediction = best_clf.predict([x])[0]
print(f"Model prediction: {classes[prediction]}")
return classes[prediction]

# function to retrain the model as part of the feedback loop
def retrain(data):
# pull out the relevant X and y from the FeedbackIn object
X = [list(d.dict().values())[:-1] for d in data]
y = [r_classes[d.flower_class] for d in data]

# fit the classifier again based on the new data obtained
best_clf.fit(X, y)
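
For completeness, here is a minimal sketch of driving `load_model`, `predict` and `retrain` outside FastAPI; `Query` and `Feedback` are hypothetical stand-ins for the pydantic models in `main.py`, included only because both helpers expect objects with a `.dict()` method.

```python
# Minimal sketch: use ml_utils directly, without the FastAPI layer.
# Query and Feedback are hypothetical stand-ins for the pydantic models in main.py.
from dataclasses import dataclass, asdict

from ml_utils import load_model, predict, retrain


@dataclass
class Query:
    sepal_length: float
    sepal_width: float
    petal_length: float
    petal_width: float

    def dict(self):  # mimic pydantic's BaseModel.dict()
        return asdict(self)


@dataclass
class Feedback(Query):
    flower_class: str = ""


load_model()                                            # trains both classifiers and keeps the better one
print(predict(Query(5.1, 3.5, 1.4, 0.2)))               # illustrative measurements
retrain([Feedback(5.1, 3.5, 1.4, 0.2, "Iris Setosa")])  # refit the chosen classifier on feedback data
```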
53 changes: 45 additions & 8 deletions test_app.py
@@ -1,21 +1,58 @@
from fastapi.testclient import TestClient
from main import app


# test to check the correct functioning of the /ping route
def test_ping():
with TestClient(app) as client:
response = client.get("/ping")
# asserting the correct response is received
assert response.status_code == 200
assert response.json() == {"ping":"pong"}
assert response.json()["ping"] == "pong"


# test to check if Iris Virginica is classified correctly
def test_pred_virginica():
# defining a sample payload for the testcase
payload = {
"sepal_length": 3,
"sepal_width": 5,
"petal_length": 3.2,
"petal_width": 4.4,
}
with TestClient(app) as client:
response = client.post("/predict_flower", json=payload)
# asserting the correct response is received
assert response.status_code == 200
assert response.json()["flower_class"] == "Iris Virginica"

# task 2

# test to check if Iris Setosa is classified correctly
def test_pred_setosa():
# defining a sample payload for the testcase
payload = {
"sepal_length": 5.1,
"sepal_width": 3.5,
"petal_length": 1.4,
"petal_width": 0.2,
}
with TestClient(app) as client:
response = client.post("/predict_flower", json=payload)
# asserting the correct response is received
assert response.status_code == 200
assert response.json()["flower_class"] == "Iris Setosa"

# test to check if Iris Versicolour is classified correctly
def test_pred_versicolour():
# defining a sample payload for the testcase
payload = {
"sepal_length": 3,
"sepal_width": 5,
"petal_length": 3.2,
"petal_width": 4.4
"sepal_length": 7,
"sepal_width": 3.2,
"petal_length": 4.7,
"petal_width": 1.4,
}
with TestClient(app) as client:
response = client.post('/predict_flower', json=payload)
response = client.post("/predict_flower", json=payload)
# asserting the correct response is received
assert response.status_code == 200
assert response.json() == {'flower_class': "Iris Virginica"}
assert response.json()["flower_class"] == "Iris Versicolour"
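
Beyond the two classification tests added for task 2, one more test could cover the `/feedback_loop` route introduced in `main.py`. This is a hedged sketch, not part of the pull request; the feedback record is illustrative.

```python
# Sketch of an additional test exercising the /feedback_loop route.
# TestClient and app are already imported at the top of test_app.py; shown again for completeness.
from fastapi.testclient import TestClient
from main import app


def test_feedback_loop():
    # illustrative feedback record matching the FeedbackIn schema
    payload = [
        {
            "sepal_length": 5.1,
            "sepal_width": 3.5,
            "petal_length": 1.4,
            "petal_width": 0.2,
            "flower_class": "Iris Setosa",
        }
    ]
    with TestClient(app) as client:
        response = client.post("/feedback_loop", json=payload)
        # asserting the correct response is received
        assert response.status_code == 200
        assert response.json()["detail"] == "Feedback loop successful"
```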