Update Dockerfile, fix imports, and lint docstrings. (#9)

* Drop .idea from project and add to .gitignore.
* Remove redundant build steps, rely on base image instead.
* Replace opencv-python with opencv-python-headless. Update version to drop Python2 support. Fixes #4.
* Upgrade pandas to v1.
* Update docstring formatting.
* Another docstring update.
* Sort imports with isort and remove unused imports.
* Ignore training.py from coverage report. No need to report on this, it is untested.
* Markdown linting.
* More docstring updates.
* Final docstring updates.
* PEP8
* opencv-headless does not need so many deps.
* Import DBSCAN from sklearn.
vanvalenlab · Dec 8, 2021 · 52338fb · 52338fb
1 parent bd421f2
commit 52338fb
Show file tree

Hide file tree

Showing 38 changed files with 725 additions and 663 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -15,3 +15,4 @@ show_missing = True
 omit =
     **/*_test.py
     data/*
+    deepcell_spots/training.py
diff --git a/.gitignore b/.gitignore
@@ -107,6 +107,7 @@ venv.bak/
 
 # IDE files and folders
 .vscode/
+.idea/
 
 # OS generated files
 .DS_Store
@@ -120,4 +121,4 @@ Thumbs.db
 # Data files
 *.h5
 *.trk
-*.trks
+*.trks
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/deepcell-spots.iml b/.idea/deepcell-spots.iml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/Dockerfile b/Dockerfile
@@ -1,32 +1,24 @@
-# Use tensorflow/tensorflow as the base image
+# Use vanvalenlab/deepcell-tf as the base image
 # Change the build arg to edit the tensorflow version.
 # Only supporting python3.
-ARG DEEPCELL_VERSION=0.11.0
+ARG DEEPCELL_VERSION=0.11.0-gpu
 
-FROM deepcell:${DEEPCELL_VERSION}
+FROM vanvalenlab/deepcell-tf:${DEEPCELL_VERSION}
 
-# System maintenance
-RUN /usr/bin/python3 -m pip install --upgrade pip
-
-# installs git into the Docker image
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install git -y
-RUN apt-get install ffmpeg libsm6 libxext6  -y
-
-WORKDIR /notebooks
+# Install git for postcode installation
+RUN apt-get update && apt-get install -y \
+    git && \
+    rm -rf /var/lib/apt/lists/*
 
 # Copy the required setup files and install the deepcell-tf dependencies
 COPY setup.py README.md requirements.txt /opt/deepcell-spots/
 
 # Prevent reinstallation of tensorflow and install all other requirements.
 RUN sed -i "/tensorflow>/d" /opt/deepcell-spots/requirements.txt && \
-    pip install -r /opt/deepcell-spots/requirements.txt
+    pip install --no-cache-dir -r /opt/deepcell-spots/requirements.txt
 
 # Copy the rest of the package code and its scripts
 COPY deepcell_spots /opt/deepcell-spots/deepcell_spots
 
 # Copy over deepcell notebooks
 COPY notebooks/ /notebooks/
-
-CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--allow-root"]
diff --git a/README.md b/README.md
@@ -5,8 +5,8 @@
 
 `deepcell-spots` is a deep learning library for fluorescent spot detection image analysis. It allows you to apply pre-existing models and train new deep learning models for spot detection. It is written in Python and built using [TensorFlow](https://github.com/tensorflow/tensorflow), [Keras](https://www.tensorflow.org/guide/keras) and [DeepCell](https://github.com/vanvalenlab/deepcell-tf).
 
-
 ## DeepCell-spots for Developers
+
 Build and run a local docker container, similarly to the instructions for deepcell-tf. The relevant parts are copied here with modifications to work for deepcell-spots. For more elaborate instructions, see the [deepcell-tf README](https://github.com/vanvalenlab/deepcell-tf/blob/master/README.md).
 
 ### Build a local docker container, specifying the tensorflow version with TF_VERSION

diff --git a/deepcell_spots/applications/spot_detection.py b/deepcell_spots/applications/spot_detection.py
@@ -27,17 +27,18 @@
 
 from __future__ import absolute_import, division, print_function
 
-import glob
 import os
 import timeit
 
 import tensorflow as tf
 from deepcell.applications import Application
+
 from deepcell_spots.dotnet import *
 from deepcell_spots.dotnet_losses import DotNetLosses
 from deepcell_spots.postprocessing_utils import y_annotations_to_point_list_max
 from deepcell_spots.preprocessing_utils import min_max_normalize
 
+
 MODEL_PATH = ('https://deepcell-data.s3-us-west-1.amazonaws.com/'
               'saved-models/SpotDetection-3.tar.gz')
 

diff --git a/deepcell_spots/cluster_vis.py b/deepcell_spots/cluster_vis.py
@@ -29,7 +29,6 @@
 from itertools import combinations
 
 import numpy as np
-from scipy.spatial import distance
 
 
 def jitter(coords, size):
@@ -79,16 +78,15 @@ def label_graph_ann(G, coords, exclude_last=False):
     """Labels the annotator associated with each node in the graph
 
     Args:
-        G (networkx graph): Graph with edges indicating clusters of points assumed to be derived
-        from the same ground truth detection
-    coords : matrix
-        2d-array of detected point locations for each classical algorithm used
-    exclude_last : boolean
-        Only set as True to exclude a point that has been included for the purpose of normalization
+        G (networkx.Graph): Graph with edges indicating clusters of points
+            assumed to be derived from the same ground truth detection
+        coords (numpy.array): 2d-array of detected point locations for each
+            classical algorithm used
+        exclude_last (bool): Only set as True to exclude a point that has been
+            included for the purpose of normalization
 
     Returns:
-        G_new (networkx graph): Labeled graph
-
+        networkx.Graph: Labeled graph
     """
 
     G_new = G.copy()
@@ -114,20 +112,21 @@ def label_graph_ann(G, coords, exclude_last=False):
 
 def label_graph_gt(G, detection_data, gt):
 
-    """Labels the ground truth identity of each node in the graph -- intended for simulated data
+    """Labels the ground truth identity of each node in the graph.
 
-    Args:
-        G (networkx graph): Graph with edges indicating clusters of points assumed to be derived
-            from the same ground truth detection
-        detection_data (matrix): Matrix with dimensions (number of clusters) x (number of
-            algorithms) with value of 1 if an algorithm detected
-            the cluster and 0 if it did not
-        gt (array): Array with length (number of cluster) with value of 1 if cluster is a true
-            positive detection and 0 if it is a false positive
+    Intended for simulated data.
 
+    Args:
+        G (networkx.Graph): Graph with edges indicating clusters of points
+            assumed to be derived from the same ground truth detection
+        detection_data (numpy.array): Matrix with dimensions (number of clusters) x
+            (number of algorithms) with value of 1 if an algorithm detected
+            the cluster and 0 if it did not.
+        gt (numpy.array): Array with length (number of cluster) with value of 1 if
+            cluster is a true positive detection and 0 if it is a false positive.
 
     Returns:
-        G_new (networkx graph): Labeled graph
+        networkx.Graph: Labeled graph
     """
 
     G_new = G.copy()
@@ -155,22 +154,22 @@ def label_graph_gt(G, detection_data, gt):
 
 def label_graph_prob(G, detection_data, p_matrix):
 
-    """Labels the EM output probability of being a ground truth true detection for each cluster in
-    the graph
+    """Labels the EM output probability of being a ground truth true detection
+    for each cluster in the graph.
 
     Args:
-        G (networkx graph): Graph with edges indicating clusters of points assumed to be derived
-            from the same ground truth detection
-        detection_data (matrix): Matrix with dimensions (number of clusters) x (number of
-            algorithms) with value of 1 if an algorithm detected
-            the cluster and 0 if it did not
-        p_matrix (matrix): Matrix with dimensions (number of clusters) x 2 where first column is
-        the probability that a cluster is a true positive and second column is the probability that
-        it is a false positive
-
+        G (networkx.Graph): Graph with edges indicating clusters of points
+            assumed to be derived from the same ground truth detection
+        detection_data (numpy.array): Matrix with dimensions (number of
+            clusters) x (number of algorithms) with value of 1 if an algorithm
+            detected the cluster and 0 if it did not.
+        p_matrix (matrix): Matrix with dimensions (number of clusters) x 2
+            where first column is the probability that a cluster is a true
+            positive and second column is the probability that it is a
+            false positive.
 
     Returns:
-        G_new (networkx graph): Labeled graph
+        networkx.Graph: Labeled graph
     """
     G_new = G.copy()
 

diff --git a/deepcell_spots/cluster_vis_test.py b/deepcell_spots/cluster_vis_test.py
@@ -29,10 +29,11 @@
 from itertools import combinations
 
 import numpy as np
-from deepcell_spots.cluster_vis import ca_to_adjacency_matrix, jitter
 from scipy.spatial import distance
 from tensorflow.python.platform import test
 
+from deepcell_spots.cluster_vis import ca_to_adjacency_matrix, jitter
+
 
 class TestClusterVis(test.TestCase):
     def test_jitter(self):

diff --git a/deepcell_spots/data_utils.py b/deepcell_spots/data_utils.py
@@ -32,23 +32,24 @@
 
 
 def slice_image(X, reshape_size, overlap=0):
-    '''
-    Slice images in X into smaller parts. similar to deepcell.utils.data_utils reshape_matrix
+    """Slice images in X into smaller parts.
+
+    Similar to ``deepcell.utils.data_utils.reshape_matrix``.
 
     Args:
         X (np.array) containing images: has size (img_number, y, x, channel)
         reshape_size: list of 2 values: y_size, x_size
-        overlap (int): number of pixels overlapping in each row/column with the pixels from the
-                same row/column in the neighboring slice
+        overlap (int): number of pixels overlapping in each row/column with
+            the pixels from the same row/column in the neighboring slice
 
     Returns:
-        new_X: stack of reshaped images in order of small to large y, then small to large x
-                position in the original image
-        np.array of size (n*img_number, y_size, x_size, channel)
-        where n = number of images each image in X was sliced into
-        if the original image lengths aren't divisible by y_size, x_size, the last image in each
-        row / column overlaps with the one before
-    '''
+        numpy.array: Stack of reshaped images in order of small to large y,
+            then small to large x position in the original image.
+            np.array of size (n*img_number, y_size, x_size, channel)
+            where n = number of images each image in X was sliced into.
+            If the original image lengths aren't divisible by y_size, x_size,
+            the last image in each row / column overlaps with the one before.
+    """
     image_size_x = X.shape[1]
     image_size_y = X.shape[2]
 
@@ -90,27 +91,28 @@ def slice_image(X, reshape_size, overlap=0):
 
 
 def slice_annotated_image(X, y, reshape_size, overlap=0):
-    '''
-    Slice images in X into smaller parts. similar to deepcell.utils.data_utils reshape_matrix
+    """Slice images in X into smaller parts.
+
+    Similar to ``deepcell.utils.data_utils.reshape_matrix``.
 
     Args:
         X (np.array) containing images: has shape (img_number, y, x, channel)
         reshape_size: list of 2 values: y_size, x_size
-        overlap (int): number of pixels overlapping in each row/column with the pixels from the
-        same row/column in the neighboring slice
-        y (list / np.array) containing coordinate annotations: has length (img_number),
-        each element of the list is a (N,2) np.array where N=the number of points in the image
+        overlap (int): number of pixels overlapping in each row/column with the
+            pixels from the same row/column in the neighboring slice
+        y (list / np.array): Contains coordinate annotations.
+            Has length (img_number), each element of the list is a (N, 2)
+            np.array where N=the number of points in the image.
 
     Returns:
-        new_X: stack of reshaped images in order of small to large y, then small to large x
-        position in the original image
-        np.array of size (n*img_number, y_size, x_size, channel)
-        where n = number of images each image in X was sliced into
-        if the original image lengths aren't divisible by y_size, x_size, the last image in each
-        row / column overlaps with the one before
-
-        new_y: list of length n*img_number
-    '''
+        numpy.array: Stack of reshaped images in order of small to large y,
+            then small to large x position in the original image; np.array
+            of size (n*img_number, y_size, x_size, channel) where n = number
+            of images each image in X was sliced into. If the original image
+            lengths aren't divisible by y_size, x_size, the last image in
+            each row / column overlaps with the one before.
+        list: list of length n*img_number
+    """
     image_size_y = X.shape[1]
     image_size_x = X.shape[2]
 
@@ -163,14 +165,15 @@ def slice_annotated_image(X, y, reshape_size, overlap=0):
 
 def get_data(file_name, test_size=.2, seed=0, allow_pickle=False):
     """Load data from NPZ file and split into train and test sets
-    This is a copy of deepcell's utils.data_utils.get_data, with allow_pickle added and mode removed
+    This is a copy of ``deepcell.utils.data_utils.get_data``,
+    with allow_pickle added and mode removed.
 
     Args:
         file_name (str): path to NPZ file to load
         test_size (float): percent of data to leave as testing holdout
         seed (int): seed number for random train/test split repeatability
-        allow_pickle (bool): if True, allow loading pickled object arrays stored in npz files
-        (via numpy.load)
+        allow_pickle (bool): if True, allow loading pickled object arrays
+            stored in npz files (via numpy.load).
 
     Returns:
         (dict, dict): dict of training data, and a dict of testing data

diff --git a/deepcell_spots/data_utils_test.py b/deepcell_spots/data_utils_test.py
@@ -29,12 +29,13 @@
 import os
 
 import numpy as np
-from deepcell_spots.data_utils import (get_data, slice_annotated_image,
-                                       slice_image)
 from sklearn.model_selection import train_test_split
 from tensorflow.python.keras import backend as K
 from tensorflow.python.platform import test
 
+from deepcell_spots.data_utils import (get_data, slice_annotated_image,
+                                       slice_image)
+
 
 class TestDataUtils(test.TestCase):
     def test_slice_image(self):
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,3 +15,4 @@ show_missing = True @@
     omit =
         **/*_test.py
         data/*
+        deepcell_spots/training.py