Staging (#107)

* Merged kes-python-flask-app and file-handling branches
* Add Kaggle API, File parsing, Calculations, and GeneratePDF
  Deployed on Heroku from Elizabeth's fork to get the 'analyze the dataset' button to run and return a report in analyse.html. Now, we are merging into Kes's branch to get the analyse.html page working on the heroku app that is meant to be for the project.
* Update Installation instructions
* Delete data.csv, Add report folder and kaggle info
  Need to have .kaggle file for the app to have access to the kaggle api

Co-authored-by: Kes <[email protected]>
Co-authored-by: sakshigupta265 <[email protected]>
Co-authored-by: kes cardoso <[email protected]>
Co-authored-by: wyang0216 <[email protected]>
Co-authored-by: sakshigupta265 <[email protected]>
kescardoso · Apr 13, 2021 · 800c3e6 · 800c3e6
1 parent c67a362
commit 800c3e6
Show file tree

Hide file tree

Showing 24 changed files with 841 additions and 163 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,5 @@
 __pycache__
 venv
 env.py
-.kaggle
 .vscode
 .DS_Store
diff --git a/.kaggle/kaggle.json b/.kaggle/kaggle.json
@@ -0,0 +1 @@
+{"username":"elizabethcrouther","key":"7a5c2300510b71d5ce544950879b0817"}
diff --git a/ROI.py b/ROI.py
@@ -0,0 +1,10 @@
+import cv2
+
+def region_of_interest(faces, img_copy, offset=0):
+    """Crop the region of the last detected face out of an image.
+
+    Args:
+        faces: iterable of (x, y, w, h) boxes. May be empty.
+        img_copy: image that supports 2-D slicing as [rows, cols], e.g. a
+            NumPy array.
+        offset: extra margin, in pixels, added on every side of the box.
+            Defaults to 0 (the exact box).
+
+    Returns:
+        The slice of ``img_copy`` for the LAST box in ``faces`` (earlier
+        boxes are overwritten, as before), or None when ``faces`` is empty.
+    """
+    # Start from None so an empty detection result no longer raises
+    # UnboundLocalError at the return statement.
+    face_section = None
+
+    for x, y, w, h in faces:
+        # Clamp the start indices: a negative start would wrap around to the
+        # far edge of the array instead of stopping at the border.
+        top = max(y - offset, 0)
+        left = max(x - offset, 0)
+        face_section = img_copy[top:y + h + offset, left:x + w + offset]
+
+    return face_section
diff --git a/app.py b/app.py
@@ -12,7 +12,8 @@
 
 # only importing this function prevents 
 # the whole .py file from executing on startup
-from runTerminalCommands import startCommands 
+
+from runTerminalCommands import openFiles, findReadableFiles 
 
 if os.path.exists("env.py"):
     import env
@@ -127,20 +128,32 @@ def analyse_data():
         if fileString is not None:
             split_filename = fileString.split('.com/')
             fileString = split_filename[1]
-            reportMade = startCommands(fileString)
-            # With open('/Users/mac/IdeaProjects/datasetbucket/report.pdf', 'rb') 
-            # as static_file """
-            if reportMade:
+
+            targetDataPath = os.path.join('https://github.com/eliboss/datasetbucket/raw/main/dataFiles', fileString)
+
+            time.sleep(6)
+            targetReportPath = os.path.join('https://github.com/eliboss/datasetbucket/raw/main/reportdir', 'report.pdf')
+            # reportMade = 
+            # print('report made: ', reportMade)
+            #print(targetReportPath)
+
+            reportMade, reportName = openFiles(fileString, targetDataPath, targetReportPath)
+            if reportMade is not None:
+                print('report made: ', reportMade)
+                print('report name: ', reportName)
                 time.sleep(5)
+                #reportName = reportName+'.pdf'
+                #reportPath = os.path.join(reportMade, reportName)
+
                 try:
-                    return send_file('/Users/mac/IdeaProjects/datasetbucket/report.pdf', 
-                                      as_attachment=True)
+                    return send_file(reportMade, as_attachment=True)
                 except:
                     return render_template("analyse.html", 
-                                            dataToRender="Unable able to generate report")
+                                            dataToRender="Unable to generate report")
             else:
                 return render_template("analyse.html", 
-                                        dataToRender="Unable able to generate report")
+                                        dataToRender="Unable to generate report")
+
     return render_template("analyse.html")
 
 

diff --git a/calculations.py b/calculations.py
@@ -2,6 +2,10 @@
 from statistics import mean, variance
 from sklearn.linear_model import LinearRegression
 
+import matplotlib
+import matplotlib.pyplot as plt
+
+
 # calculate the mean of an array of data
 def calcMean(data):
     if data is None: return None
@@ -32,7 +36,7 @@ def calcLinearReg(npArray):
 
 # calculates how many unique values are in an array
 # TODO: make it work for a dict too
-def calcUniquieValues(data):
+def calcUniqueValues(data):
     if data is None:
         return
     array = np.array(data)
@@ -42,4 +46,43 @@ def calcUniquieValues(data):
         if d not in temparray:
             temparray.append(d)
             count += 1
-    return count
+    return count
+
+# calculates the number of samples for each value
+def calcBreakDown(data):
+    """Describe, in one sentence, how many samples carry each distinct value.
+
+    Args:
+        data: array-like of labels accepted by ``np.unique``; may be numeric
+            or strings.
+
+    Returns:
+        A sentence such as
+        "You have 1 samples labelled as 0 and 3 samples labelled as 1.",
+        "You have no samples." for empty input, or None when ``data`` is
+        None (matching the other calc* helpers in this file).
+    """
+    if data is None:
+        return None
+
+    unique, counts = np.unique(data, return_counts=True)
+
+    parts = []
+    for value, count in zip(unique, counts):
+        # Whole-number floats and booleans are printed as plain integers
+        # ("1", not "1.0" or "True"), which is what the previous int cast
+        # produced. Everything else is printed as-is, so string labels no
+        # longer crash and fractional labels are no longer truncated.
+        if isinstance(value, (bool, np.bool_, float, np.floating)) and float(value).is_integer():
+            label = str(int(value))
+        else:
+            label = str(value)
+        parts.append(str(int(count)) + " samples labelled as " + label)
+
+    if not parts:
+        return "You have no samples."
+
+    return "You have " + " and ".join(parts) + "."
+
+# saves a histogram for a feature
+def calcHistogram(data, category):
+    """Plot ``data`` as a histogram, save it as a PNG and return the file name.
+
+    Args:
+        data: values handed to ``calcUniqueValues`` and ``plt.hist``.
+        category: feature name; used for the x-axis label, the plot title
+            (``category + " Histogram"``) and the output file name.
+
+    Returns:
+        The name of the saved image, ``category + " Histogram.png"``. The
+        name is relative, so the file lands wherever ``plt.savefig``
+        resolves it.
+    """
+    chart_title = category + " Histogram"
+    output_name = chart_title + '.png'
+
+    # One bin per distinct value, capped at 10 bins
+    bin_count = min(calcUniqueValues(data), 10)
+
+    # Switch to the Agg backend before drawing
+    matplotlib.use('agg')
+    plt.hist(data, bins=bin_count)
+    plt.title(chart_title)
+    plt.xlabel(category)
+    plt.ylabel("Count")
+    plt.savefig(output_name)
+    plt.close()
+
+    return output_name
+
diff --git a/dataFiles/do_not_delete.txt b/dataFiles/do_not_delete.txt
@@ -0,0 +1,5 @@
+f_name,l_name
+Sakshi,Gupta
+Elizabeth,Crouther
+Kes,Cardoso
+William,Yang
diff --git a/detectFace.py b/detectFace.py
@@ -0,0 +1,58 @@
+import cv2
+
+def draw_found_faces(detected, image, color: tuple):
+    """Outline every detected box on ``image`` (drawn in place).
+
+    Args:
+        detected: iterable of (x, y, width, height) boxes.
+        image: image passed straight to ``cv2.rectangle``.
+        color: colour tuple passed straight to ``cv2.rectangle``.
+    """
+    for box in detected:
+        left, top, box_width, box_height = box
+        top_left = (left, top)
+        bottom_right = (left + box_width, top + box_height)
+        cv2.rectangle(image, top_left, bottom_right, color, thickness=2)
+
+def detect_faces(img_path):
+    """Detect frontal faces in the image stored at ``img_path``.
+
+    The image is resized to 256x256 and converted to grayscale before the
+    ``haarcascade_frontalface_alt.xml`` Haar cascade is run over it.
+
+    Args:
+        img_path: path handed to ``cv2.imread``.
+
+    Returns:
+        A tuple ``(img_copy, front_faces)``: the resized colour image with no
+        rectangles drawn on it, and the detections returned by
+        ``detectMultiScale`` (unpacked by ``draw_found_faces`` as
+        x, y, width, height).
+
+    Raises:
+        cv2.error: if the image cannot be read. The same exception type was
+            raised before (by the resize call), now with a clear message.
+    """
+    # creating haar cascade classifier
+    faceCascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_alt.xml")
+
+    # NOTE: the profile-face cascade was loaded and run here but its result
+    # was never used; it has been dropped to avoid a second detection pass.
+
+    # reading image; imread signals failure by returning None, not raising
+    img = cv2.imread(img_path)
+    if img is None:
+        raise cv2.error("detect_faces: unable to read image: " + str(img_path))
+
+    # reducing the size of image to a standard 256x256 image
+    img = cv2.resize(img,(256,256))
+    # untouched copy taken BEFORE any rectangles are drawn; this is what
+    # the caller receives
+    img_copy = img.copy()
+
+    # converting to gray scale for the cascade
+    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Detect faces in the image
+    front_faces = faceCascade.detectMultiScale(
+        gray_img,
+        scaleFactor=1.3,
+        minNeighbors=5,
+    )
+
+    # Draw green rectangles around faces on the working image. Only useful
+    # for debugging (e.g. with cv2.imshow); the annotated image is not returned.
+    draw_found_faces(front_faces, img, (0, 255, 0))
+
+    return img_copy,front_faces
diff --git a/extractDominantColor.py b/extractDominantColor.py
@@ -0,0 +1,33 @@
+import cv2
+import getColorInformation
+from sklearn.cluster import KMeans
+import warnings
+
+
+def extractDominantColor(image,number_of_colors,hasThresholding=False):
+  """Cluster the pixels of ``image`` with KMeans and describe the clusters.
+
+  Args:
+    image: BGR image (it is converted with ``cv2.COLOR_BGR2RGB``).
+    number_of_colors: number of clusters requested by the caller.
+    hasThresholding: when equal to True, one extra cluster is requested so
+      that the black cluster can be neglected.
+
+  Returns:
+    Whatever ``getColorInformation.getColorInformation`` returns for the
+    fitted labels and cluster centres.
+  """
+  # Quick fix: one extra cluster when thresholding is on, to neglect black
+  cluster_count = number_of_colors + 1 if hasThresholding == True else number_of_colors
+
+  # Work on an RGB copy so the caller's image is not modified
+  rgb = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2RGB)
+
+  # Flatten to one row per pixel, three channels per row
+  height, width = rgb.shape[0], rgb.shape[1]
+  pixels = rgb.reshape(height * width, 3)
+
+  # Fixed random_state keeps the clustering repeatable
+  kmeans = KMeans(n_clusters=cluster_count, random_state=0)
+
+  # Fit with warnings silenced
+  with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    kmeans.fit(pixels)
+
+  return getColorInformation.getColorInformation(kmeans.labels_,kmeans.cluster_centers_,hasThresholding)
diff --git a/extractSkin.py b/extractSkin.py
@@ -0,0 +1,57 @@
+import cv2
+import numpy as np
+
+def extractSkin(image):
+  """Return a BGR image that keeps only pixels inside a fixed HSV range.
+
+  Args:
+    image: BGR image (it is converted with ``cv2.COLOR_BGR2HSV``).
+
+  Returns:
+    A BGR image in which pixels where the mask is zero are zeroed out.
+  """
+  # Work in HSV on a copy so the caller's image is left untouched
+  hsv = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2HSV)
+
+  # Fixed HSV thresholds used as the skin range
+  hsv_low = np.array([0, 48, 80], dtype=np.uint8)
+  hsv_high = np.array([20, 255, 255], dtype=np.uint8)
+
+  # Single-channel mask of in-range pixels, cleaned up with a 3x3 Gaussian blur
+  mask = cv2.GaussianBlur(cv2.inRange(hsv, hsv_low, hsv_high), (3, 3), 0)
+
+  # Keep the masked HSV pixels, then convert the result back to BGR
+  masked_hsv = cv2.bitwise_and(hsv, hsv, mask=mask)
+  return cv2.cvtColor(masked_hsv, cv2.COLOR_HSV2BGR)
+
+def removeBlack(estimator_labels, estimator_cluster):
+  """Drop the first black cluster, if any, from a clustering result.
+
+  Args:
+    estimator_labels: iterable of cluster labels, one per sample.
+    estimator_cluster: array of cluster centres, indexed by label.
+
+  Returns:
+    A tuple ``(occurance_counter, estimator_cluster, hasBlack)``: the
+    per-label sample counts (without the black label, if one was found),
+    the cluster centres (without the black row, if one was found) and a
+    flag telling whether a black cluster was removed.
+  """
+  # Counter was never imported by this file, so every call used to fail
+  # with NameError; import it locally to keep the fix self-contained.
+  from collections import Counter
+
+  # Check for black
+  hasBlack = False
+
+  # Get the total number of occurrences for each label
+  occurance_counter = Counter(estimator_labels)
+
+  # Walk the labels from most to least common. most_common() returns a list,
+  # so deleting from the counter inside the loop is safe.
+  for label, _ in occurance_counter.most_common(len(estimator_cluster)):
+
+    # Truncate each channel of the cluster centre to int
+    color = [int(channel) for channel in estimator_cluster[label].tolist()]
+
+    # A centre that truncates to [0, 0, 0] is treated as black
+    if color == [0, 0, 0]:
+      # delete the occurrence count and the cluster row, then stop:
+      # only the first black cluster is removed
+      del occurance_counter[label]
+      hasBlack = True
+      estimator_cluster = np.delete(estimator_cluster, label, 0)
+      break
+
+  return (occurance_counter,estimator_cluster,hasBlack)
diff --git a/filePath.py b/filePath.py
@@ -0,0 +1,40 @@
+import glob
+import os
+import cv2
+import main
+import sys
+
+def getPath(folder):
+    """Collect the .jpeg, .jpg and .png files under ``dataFiles``.
+
+    Args:
+        folder: sub-folder of ``<cwd>/dataFiles`` to search (non-Windows
+            platforms only, see the note below).
+
+    Returns:
+        A tuple ``(files, l)``: the matching paths (all .jpeg first, then
+        .jpg, then .png) and how many there are.
+    """
+    if sys.platform.startswith('win32'):
+        # NOTE(review): this branch searches every sub-folder of dataFiles
+        # and ignores `folder`. Kept as-is; confirm whether that is intended.
+        pattern_prefix = os.getcwd() + '\\dataFiles\\**\\*.'
+    else:
+        # Every non-Windows platform takes the forward-slash form. Before,
+        # only darwin/linux were handled and any other platform crashed with
+        # UnboundLocalError further down.
+        pattern_prefix = os.getcwd() + '/dataFiles/' + folder + '/*.'
+
+    files = []
+    # Extension order matches the original concatenation: jpeg + jpg + png
+    for extension in ('jpeg', 'jpg', 'png'):
+        files += glob.glob(pattern_prefix + extension, recursive=True)
+
+    return files, len(files)
+
+# For debugging without running the whole app
+
+# my_path = os.getcwd()
+# files_jpg = glob.glob(my_path + '\\dataFiles\\**\\*.jpg' , recursive=True)
+# files_jpeg = glob.glob(my_path + '\\dataFiles\\**\\*.jpeg' , recursive=True)
+# files_png = glob.glob(my_path + '\\dataFiles\\**\\*.png' , recursive=True)
+
+# files = files_jpeg + files_jpg + files_png
+# l = len(files)
+
+# main.readImage(files, l)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"username":"elizabethcrouther","key":"7a5c2300510b71d5ce544950879b0817"}