From ea8b023e90006ffa6075d4410f2ecaa6f852beb6 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <thibault.tabarin@gmail.com>
Date: Thu, 29 Sep 2022 11:10:20 -0600
Subject: [PATCH 01/28] revert to drexel version with minor modifications for
 BGNN minnows project

---
 config/config.json    |   3 +-
 dataverse_download.py |  71 +++++++++++
 gen_metadata.py       | 271 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 296 insertions(+), 49 deletions(-)
 create mode 100644 dataverse_download.py
diff --git a/config/config.json b/config/config.json
index 8b8406f..757ecd6 100644
--- a/config/config.json
+++ b/config/config.json
@@ -1,4 +1,5 @@
 {
   "ENHANCE": 1,
-  "JOEL": 0
+  "PROCESSOR": "cpu",
+  "Version":"drexel"
 }
diff --git a/dataverse_download.py b/dataverse_download.py
new file mode 100644
index 0000000..3f18151
--- /dev/null
+++ b/dataverse_download.py
@@ -0,0 +1,71 @@
+#!/usr/local/bin/python
+# Script to download a dataset from a Dataverse (https://dataverse.org/)
+#' @author
+#' John Bradley: initial code
+#' Thibault Tabarin: small modification
+#' 
+
+
+import os
+import sys
+import hashlib
+from pyDataverse.api import NativeApi, DataAccessApi
+
+
+def download_dataset(base_url, api_token, doi, directory_output):
+    api = NativeApi(base_url, api_token)
+    data_api = DataAccessApi(base_url, api_token)
+    dataset = api.get_dataset(doi)
+    files_list = dataset.json()['data']['latestVersion']['files']
+    for dv_file in files_list:
+        filepath = download_file(data_api, dv_file, directory_output)
+        verify_checksum(dv_file, filepath)
+
+
+def download_file(data_api, dv_file, directory_output):
+    filepath = dv_file["dataFile"]["filename"]
+    #directory_label = dv_file["directoryLabel"]
+    os.makedirs(directory_output, exist_ok=True)
+    filepath = os.path.join(directory_output, filepath)
+    file_id = dv_file["dataFile"]["id"]
+    print("Downloading file {}, id {}".format(filepath, file_id))
+    response = data_api.get_datafile(file_id)
+    with open(filepath, "wb") as f:
+        f.write(response.content)
+    return filepath
+
+
+def verify_checksum(dv_file, filepath):
+    """Raise ValueError unless the MD5 of `filepath` matches the Dataverse checksum."""
+    checksum = dv_file["dataFile"]["checksum"]
+    checksum_type = checksum["type"]
+    checksum_value = checksum["value"]
+    if checksum_type != "MD5":
+        raise ValueError(f"Unsupported checksum type {checksum_type}")
+    with open(filepath, 'rb') as infile:
+        digest = hashlib.md5(infile.read()).hexdigest()
+    # Compare after the with-block so the file is closed before reporting.
+    if checksum_value != digest:
+        raise ValueError(f"Hash value mismatch for {filepath}: {checksum_value} vs {digest} ")
+    print(f"Verified file checksum for {filepath}.")
+
+
+def show_usage():
+   print()
+   print(f"Usage: python {sys.argv[0]} <dataverse_base_url> <doi> <directory_output>\n")
+   print("To specify a API token set the DATAVERSE_API_TOKEN environment variable.\n")
+   print("To set the environment variable : export DATAVERSE_API_TOKEN=<my_token>")
+   print()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 4:
+         show_usage()
+         sys.exit(1)
+    else:
+        BASE_URL = sys.argv[1]
+        DOI = sys.argv[2]
+        directory_output = sys.argv[3]
+        API_TOKEN = os.environ.get('DATAVERSE_API_TOKEN')
+        #print(API_TOKEN)
+        download_dataset(BASE_URL, API_TOKEN, DOI, directory_output)
diff --git a/gen_metadata.py b/gen_metadata.py
index 67b76bb..9ea89d9 100644
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -1,3 +1,24 @@
+#!/usr/local/bin/python
+#' Adapatation for BGNN snakemake (minnows project)
+#  on Mon Aug  8 11:35:50 2022
+#' @author
+#' Joel Pepper: initial code
+#' Kevin Karnani: modified it
+#' Thibault Tabarin: modify for minnow project
+
+#' @description
+#' Minnows version original code developped by Joel Pepper and Kevin Karnani'
+#' Using the detectron2 framework from Facebook, we detect fish, eye and ruler object and
+#' collect metadata information such bounding box, fish mask, eye center, fish orientation...
+#' The added version of the ouput are
+#' Dictionnary with metadata:  {"base_name": "", "fish":
+#' {"fish_num": , "bbox": [], "pixel_analysis": true, "eye_bbox": [], "eye_center": [],
+#' "angle_degree": 9.070674226380035, "eye_direction": "left", "foreground_mean": ,
+#' "foreground_std": , "background_mean": , "background_std": },
+#' "ruler": {"bbox": [], "scale": , "unit": ""}}
+#' And fish mask (binary image)
+
+
 import json
 import math
 import os
@@ -20,35 +41,41 @@
 from skimage import filters, measure
 from skimage.morphology import flood_fill
 from torch.multiprocessing import Pool
-
+import warnings
+warnings.filterwarnings("ignore")
 # torch.multiprocessing.set_start_method('forkserver')
 
+# ensure the look at the right place for the configuration file
+root_file_path = os.path.dirname(__file__)
+
 VAL_SCALE_FAC = 0.5
-conf = json.load(open('config/config.json', 'r'))
+conf = json.load(open(os.path.join(root_file_path,'config/config.json'), 'r'))
 ENHANCE = bool(conf['ENHANCE'])
-JOEL = bool(conf['JOEL'])
+PROCESSOR = conf['PROCESSOR']
+VERSION = conf['Version'] # option changeable in the config file : "drexel" or "bgnn"
 IOU_PCT = .02
 
 with open('config/mask_rcnn_R_50_FPN_3x.yaml', 'r') as f:
     iters = yaml.load(f, Loader=yaml.FullLoader)["SOLVER"]["MAX_ITER"]
 
 
-def init_model(enhance_contrast=ENHANCE, joel=JOEL):
+def init_model(processor=PROCESSOR):
     """
     Initialize model using config files for RCNN, the trained weights, and other parameters.
 
     Returns:
         predictor -- DefaultPredictor(**configs).
     """
+    root_file_path = os.path.dirname(__file__)
     cfg = get_cfg()
-    cfg.merge_from_file("config/mask_rcnn_R_50_FPN_3x.yaml")
+    cfg.merge_from_file(os.path.join(root_file_path,'config/mask_rcnn_R_50_FPN_3x.yaml'))
     cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
-    if not joel:
-        cfg.OUTPUT_DIR += f"/non_enhanced" if not enhance_contrast else f"/enhanced"
-        # cfg.OUTPUT_DIR += f"/non_enhanced_{iters}" if not enhance_contrast else f"/enhanced_{iters}"
+    OUTPUT_DIR = os.path.join(root_file_path, 'output')
     cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
     cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
+    cfg.MODEL.DEVICE = processor
     predictor = DefaultPredictor(cfg)
+    
     return predictor
 
 
@@ -135,14 +162,18 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
     if visualize:
         cv2.imshow('prediction', np.array(vis.get_image()[:, :, ::-1], dtype=np.uint8))
         cv2.waitKey(0)
-    os.makedirs('images', exist_ok=True)
-    os.makedirs('images/enhanced', exist_ok=True)
-    os.makedirs('images/non_enhanced', exist_ok=True)
-    dirname = 'images/'
-    dirname += 'enhanced/' if enhance_contrast else 'non_enhanced/'
-    print(file_name)
-    cv2.imwrite(f'{dirname}/gen_prediction_{f_name}.png',
-                vis.get_image()[:, :, ::-1])
+        
+    if False:    # to save visualization of the prediction       
+        os.makedirs('images', exist_ok=True)
+        os.makedirs('images/enhanced', exist_ok=True)
+        os.makedirs('images/non_enhanced', exist_ok=True)
+        dirname = 'images/'
+        dirname += 'enhanced/' if enhance_contrast else 'non_enhanced/'
+        print(file_name)
+        cv2.imwrite(f'{dirname}/gen_prediction_{f_name}.png',
+                        vis.get_image()[:, :, ::-1])
+    
+
     skippable_fish = []
     fish_length = 0
     if fish:
@@ -191,6 +222,10 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
             val = adaptive_threshold(bbox, im_gray)
             bbox, mask, pixel_anal_failed = gen_mask(bbox, file_path,
                                                      file_name, im_gray, val, detectron_mask)
+            
+            # Convert the nask to np.uint8 to save it latter
+            mask_uint8 = np.where(mask == 1, 255, 0).astype(np.uint8)
+            
             centroid, evecs, cont_length, cont_width, length, width, area = pca(mask, scale)
             major, minor = evecs[0], evecs[1]
 
@@ -297,7 +332,7 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
     results['fish_count'] = len(insts[(insts.pred_classes == 0).logical_and(insts.scores > 0.3)]) - \
                             len(skippable_fish) if multiple_fish else int(results['has_fish'])
     results['detected_fish_count'] = fish_length
-    return {f_name: results}
+    return {f_name: results}, mask_uint8
 
 
 def gen_metadata_upscale(file_path, fish):
@@ -861,17 +896,6 @@ def gen_mask(bbox, file_path, file_name, im_gray, val, detectron_mask, flipped=F
         new_mask = detectron_mask.astype('uint8')
         bbox = bbox_orig
         failed = True
-    # arr4 = np.where(new_mask == 1, 255, 0).astype(np.uint8)
-    # (left, top, right, bottom) = shrink_bbox(new_mask)
-    # arr4[top:bottom, left] = 175
-    # arr4[top:bottom, right] = 175
-    # arr4[top, left:right] = 175
-    # arr4[bottom, left:right] = 175
-    # im2 = Image.fromarray(arr4, 'L')
-    # dirname = 'images/'
-    # dirname += 'enhanced/' if ENHANCE else 'non_enhanced/'
-    # f_name = file_name.split('.')[0]
-    # im2.save(f'{dirname}/gen_mask_{f_name}.png')
     return bbox, new_mask, failed
 
 
@@ -915,37 +939,188 @@ def gen_metadata_safe(file_path):
     Deals with erroneous metadata generation errors.
     """
     try:
-        return gen_metadata(file_path)
+        result, mask_uint8 = gen_metadata(file_path)
+        return result, mask_uint8
     except Exception as e:
         print(f'{file_path}: Errored out ({e})')
         return {file_path: {'errored': True}}
 
+def show_usage_drexel():
+    
+    print()
+    print(f'Usage : {sys.argv[0]} <file_path> <output.json>/n')
+    print('Version drexel with output format for BGNN')
 
-def main():
-    direct = sys.argv[1]
-    if os.path.isdir(direct):
-        files = [entry.path for entry in os.scandir(direct)]
-        if len(sys.argv) > 2:
-            files = files[:int(sys.argv[2])]
+def main_drexel():
+    """
+    Main function from Drexel version used by Joel and Kevin
+    the result are save automatically in folder and file describe in the following code
+    The input is the extract from argument passed to the fucntion called in command line:
+        Input could be :
+            + folder containing many image files
+            + a single file 
+            + a serie of file
+        output :
+            if multi files, everything is aggregated in adictionary save as "metadata.json"
+            if single file, print the file as pretty print
+    Arguments input :
+        if only one : a folder or a single file
+        if more than 1
+    Returns
+    -------
+    None.
+
+    """
+    # show usage if no argument given
+
+    
+    if len(sys.argv)==2:
+        
+        direct = sys.argv[1]
+        fname = "metadata.json"
+        if os.path.isdir(direct):
+            files = [entry.path for entry in os.scandir(direct)]
+        else:
+            files = [direct]
+            
+   # show usage if wrong number of arguments given
     else:
-        files = [direct]
+        show_usage_drexel()
+        return
+    
     #with Pool(2) as p:
     #    results = p.map(gen_metadata_safe, files)
     results = map(gen_metadata_safe, files)
+    
     output = {}
-    for i in results:
+    for i,mask in results:
         output[list(i.keys())[0]] = list(i.values())[0]
-    fname = f'metadata_{iters}.json' if not JOEL else 'metadata.json'
-    if ENHANCE:
-        fname = 'enhanced_' + fname
-    else:
-        fname = 'non_enhanced_' + fname
-    if len(output) > 1:
-        with open(fname, 'w') as f:
-            json.dump(output, f)
-    else:
-        pprint.pprint(output)
+
+    with open(fname, 'w') as f:
+        json.dump(output, f)
+
+
+def reformat_for_bgnn(result):
+    """
+    Reformat and reduce the size of the result dictionary. 
+    Collect only the data necessary for BGNN minnow project. The new format matches the 
+    BGNN_metadata version. Therefore some of the value not calcualted in drexel version are by 
+    defaulset to "None". 
+
+    Parameters
+    ----------
+    result : dict
+        DESCRIPTION. output from gen_metadata()
+
+    Returns
+    -------
+    bgnn_result : dict
+        DESCRIPTION. {'base_name': xx, 'version':xx, 
+                       'fish': {'fish_num': xx,"bbox":xx, 'pixel_analysis':xx, 'rescale':xx, 
+                            'eye_bbox': xx, 'eye_center':xx , 'angle_degree': xx,
+                            'eye_direction':xx, 'foreground_mean':xx, 'background_mean':xx}, 
+                       'ruler': {'bbox':xx, 'scale':xx, 'unit':xx}}
+
+    """
+    
+    name_base = list(result.keys())[0]
+    first_value = list(result.values())[0]
+    
+    # Fish metadata
+    fish_num = first_value['fish_count']
+    fish_bbox = first_value['fish'][0]['bbox']
+    pixel_analysis = False if first_value['fish'][0]['pixel_analysis_failed'] else True
+    
+    if first_value['fish'][0]['has_eye']:
+        eye_center = first_value['fish'][0]['eye_center']
+    else :
+        eye_center = "None"
+    
+    eye_direction = first_value['fish'][0]['side']
+    foreground_mean = first_value['fish'][0]['foreground']['mean']
+    background_mean = first_value['fish'][0]['background']['mean']
+        
+    dict_fish = {'fish_num': fish_num,"bbox":fish_bbox, 
+                 'pixel_analysis':pixel_analysis, 'rescale':"None", 
+                 'eye_bbox': "None", 'eye_center':eye_center , 'angle_degree': "None",
+                 'eye_direction':eye_direction, 'foreground_mean':foreground_mean, 
+                 'background_mean':background_mean}
+    
+    # Ruler metadata
+    ruler_bbox  = first_value['ruler_bbox'] if first_value['has_ruler'] else "None"
+    scale = first_value['ruler_bbox'] if "scale" in first_value.keys() else "None"
+    unit = first_value['unit'] if "unit" in first_value.keys() else "None"
+    
+    dict_ruler = {'bbox':ruler_bbox, 'scale':scale, 'unit':unit}
+    
+    bgnn_result = {'base_name': name_base, 'version':"from drexel", 
+                   'fish': dict_fish, 'ruler': dict_ruler} 
+    
+    return bgnn_result
+
+
+def main_bgnn(input_file, output_result, output_mask):
+    '''
+    Use the "gen_metadata" through  gen_metadata_safe
+    1- Calculate metadata and mask with gen_metadata()
+    2- Reformat the result to a simplified version for bgnn minnows project
+    3- save the result in outputs (.json amd .png files)
+
+    Parameters
+    ----------
+    file_path : string
+        location of the imae file to analysis.
+    output_json : string
+        path for dictionnary output in json format (expected '/path/to/save/my_output.json').
+    output_mask : string
+        path for mask image output in png format (expected '/path/to/save/my_mask.png').
+
+    Returns
+    -------
+    None.
+
+    '''
+    try:
+        result, mask_uint8 = gen_metadata(input_file)
+        
+        bgnn_result = reformat_for_bgnn(result)
+        
+    except Exception as e:
+            # write the error in the result dictionnary
+            bgnn_result['error'] = f'({e})'
+            print(f'{input_file}: Errored out ({e})')
+            
+    with open(output_result, 'w') as f:
+        json.dump(bgnn_result, f)
+    
+    if output_mask != None:
+        cv2.imwrite(output_mask, mask_uint8)   
+
+
+def show_usage_bgnn():
+    
+    #print()
+    print(f'Usage : {sys.argv[0]} <file_path> <metadata.json> <mask.png>/n')
+    print('Version drexel with output format for BGNN using "main_bgnn()"')
 
 
 if __name__ == '__main__':
-    main()
+    
+    if VERSION == 'drexel':
+        print(f'version : {VERSION}')
+        main_drexel()
+        
+    if VERSION == 'bgnn':
+        if len(sys.argv) == 4:
+            print(f'version : drexel for {VERSION}')
+            input_file = sys.argv[1]
+            output_json = sys.argv[2]
+            output_mask = sys.argv[3]
+            main_bgnn(input_file, output_json, output_mask)
+        else:
+            show_usage_bgnn()
+            
+            
+        
+        
+    

From 43247f810be905e4f2ccde1394519487c3bb8d32 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <thibault.tabarin@gmail.com>
Date: Thu, 29 Sep 2022 11:39:06 -0600
Subject: [PATCH 02/28] add Dockerfile to create image

---
 Dockerfile         | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 config/config.json |  2 +-
 2 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..673fd33
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,73 @@
+FROM ubuntu:20.04
+ARG DATAVERSE_API_TOKEN
+
+# Label
+LABEL org.opencontainers.image.title="metadata generation for fish image drexel version"
+LABEL org.opencontainers.image.authors=" J. Pepper, K. Karmani, T. Tabarin"
+LABEL org.opencontainers.image.source="https://github.com/hdr-bgnn/drexel_metadata"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    curl \
+    ca-certificates \
+    sudo \
+    git \
+    bzip2 \
+    libx11-6 \
+    wget \
+    build-essential \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    sudo \
+    cmake \
+    ninja-build \
+ && rm -rf /var/lib/apt/lists/*
+
+# Create a working directory
+RUN mkdir /pipeline
+WORKDIR /pipeline
+
+# Create a non-root user and switch to it
+RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
+ && chown -R user:user /pipeline
+RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
+USER user
+
+# All users can use /home/user as their home directory
+ENV HOME=/home/user
+RUN chmod 777 /home/user
+
+# Set up the Conda environment
+ENV CONDA_AUTO_UPDATE_CONDA=false \
+    PATH=/home/user/miniconda/bin:$PATH
+RUN curl -sLo ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
+ && chmod +x ~/miniconda.sh \
+ && ~/miniconda.sh -b -p ~/miniconda \
+ && rm ~/miniconda.sh \
+ && conda clean -ya
+
+RUN pip install numpy pandas pynrrd pillow scikit-image jedi==0.17.2 opencv-python-headless pyDataverse==0.3.1
+
+# Detectron2 prerequisites
+RUN pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+RUN pip install cython
+RUN pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
+# Install detectron2 version 0.6
+RUN python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.8/index.html
+
+# Setup pipeline specific scripts
+ENV PATH="/pipeline:${PATH}"
+
+ADD config/ /pipeline/config/
+ADD gen_metadata.py /pipeline/
+ADD dataverse_download.py /pipeline/
+# Download Drexel Model
+RUN python /pipeline/dataverse_download.py https://covid-commons.osu.edu/ doi:10.5072/FK2/MMX6FY /pipeline/output/
+
+# Set the default command to a usage statement
+CMD echo "Usage generate Metadata drexel version for MinnowsProject: /n"\
+"gen_metadata.py  <fish_image.jpg> <metadata.json> <mask.png>"
diff --git a/config/config.json b/config/config.json
index 757ecd6..55fd5c7 100644
--- a/config/config.json
+++ b/config/config.json
@@ -1,5 +1,5 @@
 {
   "ENHANCE": 1,
   "PROCESSOR": "cpu",
-  "Version":"drexel"
+  "Version":"bgnn"
 }

From 6eca5246d73c76abf1424156e50219db32708737 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <thibault.tabarin@gmail.com>
Date: Thu, 29 Sep 2022 13:00:38 -0600
Subject: [PATCH 03/28] minor fix on Dockerfile and gen_metadata.py

---
 Dockerfile      | 2 +-
 gen_metadata.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 gen_metadata.py

diff --git a/Dockerfile b/Dockerfile
index 673fd33..8c66fce 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -69,5 +69,5 @@ ADD dataverse_download.py /pipeline/
 RUN python /pipeline/dataverse_download.py https://covid-commons.osu.edu/ doi:10.5072/FK2/MMX6FY /pipeline/output/
 
 # Set the default command to a usage statement
-CMD echo "Usage generate Metadata drexel version for MinnowsProject: /n"\
+CMD echo "Usage generate Metadata drexel version for Minnows Project:\n"\
 "gen_metadata.py  <fish_image.jpg> <metadata.json> <mask.png>"
diff --git a/gen_metadata.py b/gen_metadata.py
old mode 100644
new mode 100755
index 9ea89d9..2283ee9
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -948,7 +948,7 @@ def gen_metadata_safe(file_path):
 def show_usage_drexel():
     
     print()
-    print(f'Usage : {sys.argv[0]} <file_path> <output.json>/n')
+    print(f'Usage : {sys.argv[0]} <file_path> <output.json>\n')
     print('Version drexel with output format for BGNN')
 
 def main_drexel():
@@ -1043,8 +1043,8 @@ def reformat_for_bgnn(result):
     dict_fish = {'fish_num': fish_num,"bbox":fish_bbox, 
                  'pixel_analysis':pixel_analysis, 'rescale':"None", 
                  'eye_bbox': "None", 'eye_center':eye_center , 'angle_degree': "None",
-                 'eye_direction':eye_direction, 'foreground_mean':foreground_mean, 
-                 'background_mean':background_mean}
+                 'eye_direction':eye_direction, 'foreground_mean':round(foreground_mean,2), 
+                 'background_mean':round(background_mean,2)}
     
     # Ruler metadata
     ruler_bbox  = first_value['ruler_bbox'] if first_value['has_ruler'] else "None"
@@ -1100,7 +1100,7 @@ def main_bgnn(input_file, output_result, output_mask):
 def show_usage_bgnn():
     
     #print()
-    print(f'Usage : {sys.argv[0]} <file_path> <metadata.json> <mask.png>/n')
+    print(f'Usage : {sys.argv[0]} <file_path> <metadata.json> <mask.png>\n')
     print('Version drexel with output format for BGNN using "main_bgnn()"')
 
 

From 402c3971003a7d358917d02142703330eef17bb1 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Thu, 29 Sep 2022 16:22:02 -0600
Subject: [PATCH 04/28] Create deploy-image.yml

---
 .github/workflows/deploy-image.yml | 43 ++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 .github/workflows/deploy-image.yml

diff --git a/.github/workflows/deploy-image.yml b/.github/workflows/deploy-image.yml
new file mode 100644
index 0000000..9c9259b
--- /dev/null
+++ b/.github/workflows/deploy-image.yml
@@ -0,0 +1,43 @@
+name: Create and publish a Docker image
+
+on:
+  push:
+    branches: ['release']
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-push-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
+        with:
+          context: .
+          build-args: |
+            DATAVERSE_API_TOKEN=${{ secrets.DATAVERSE_API_TOKEN }}
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}

From 021c7ca255261a940daa4d90c2fd185d9d7ee657 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <thibault.tabarin@gmail.com>
Date: Thu, 29 Sep 2022 20:00:09 -0600
Subject: [PATCH 05/28] fix in development

---
 gen_metadata.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gen_metadata.py b/gen_metadata.py
index 2283ee9..1bc08c3 100755
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -40,7 +40,7 @@
 from scipy import stats
 from skimage import filters, measure
 from skimage.morphology import flood_fill
-from torch.multiprocessing import Pool
+#from torch.multiprocessing import Pool
 import warnings
 warnings.filterwarnings("ignore")
 # torch.multiprocessing.set_start_method('forkserver')
@@ -55,8 +55,8 @@
 VERSION = conf['Version'] # option changeable in the config file : "drexel" or "bgnn"
 IOU_PCT = .02
 
-with open('config/mask_rcnn_R_50_FPN_3x.yaml', 'r') as f:
-    iters = yaml.load(f, Loader=yaml.FullLoader)["SOLVER"]["MAX_ITER"]
+#with open(os.path.join(root_file_path,'config/mask_rcnn_R_50_FPN_3x.yaml'), 'r') as f:
+    #iters = yaml.load(f, Loader=yaml.FullLoader)["SOLVER"]["MAX_ITER"]
 
 
 def init_model(processor=PROCESSOR):
@@ -71,7 +71,7 @@ def init_model(processor=PROCESSOR):
     cfg.merge_from_file(os.path.join(root_file_path,'config/mask_rcnn_R_50_FPN_3x.yaml'))
     cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
     OUTPUT_DIR = os.path.join(root_file_path, 'output')
-    cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
+    cfg.MODEL.WEIGHTS = os.path.join(OUTPUT_DIR, "model_final.pth")
     cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
     cfg.MODEL.DEVICE = processor
     predictor = DefaultPredictor(cfg)

From 95f5ba4f8081f7441bcd6ccce26cb67b81c3ba39 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Fri, 30 Sep 2022 10:52:42 -0600
Subject: [PATCH 06/28] Update README.md

---
 README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d0b7883..f347643 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
 # Drexel Metadata
 
 ## Goal
-To develop a tool to check the validity of metadata associated with an image, and generate things that are missing. Also includes various geometric and statistical properties on the mask generated over the biological specimen presented.
+
+The objective of this repos is to present a methodology using Neural network and classic image processing to extract automatically metadata information from fish image coming from museum. The code was initially develop by Joel Pepper and Kevin karnani who essential train a neural network model (detectron2) to identify object in the image just as fish, fish eye, scale and number on the scale ("2', "3"). In second phase "pixel analysis" technique was used to refine the mask detection around the fish. In a third phase, various classic image processing techniques were used to calculate dimension and various properties on each objects such as (fish bounding box, fish orientation, eye orientation, scale bar value....)
 
 ## Functionality
 
-Object detection is currently being performed on 5 detection classes (fish, fish eyes, rulers, and the twos and threes found on rulers). The current setup is performed on the INHS and UWZM biological specimen image repositories.
+Object detection (detectron2) is currently being performed on 5 detection classes (fish, fish eyes, rulers, and the twos and threes found on rulers). The current setup is performed on the INHS and UWZM biological specimen image repositories.
 
-### Current Criteria
+### Current Criteria on which the model has been develop
 
 1. Image must contain a fish species (no eels, seashells, butterflies, seahorses,
 snakes, etc).
@@ -20,6 +21,15 @@ in training set).
 7. Fish body must not be folded and should have no curvature.
 
 These do not need to be adhered to if properly set up/modified for a specific use case.
+The model is available on data commons osc as "Drexel_metadata_generator" at https://datacommons.tdai.osu.edu/dataverse/fish-traits/ 
+However this model is not published yet. To download you need an acount on datacommons.tdai.osu and you will need to create a DATAVERS_API_TOKEN 
+and you can use :
+```
+python dataverse_download.py https://covid-commons.osu.edu/ doi:10.5072/FK2/MMX6FY output/
+```
+Or you can do it manually on the web page  https://datacommons.tdai.osu.edu/dataset.xhtml?persistentId=doi%3A10.5072%2FFK2%2FMMX6FY&version=DRAFT
+
+If you don't have an account the model is usable via the container iamge locatated in this repo in package.
 
 ### Dependencies
 
@@ -65,10 +75,18 @@ The metadata generated is extremely specific to our use case. In addition, we pe
 3. Contrast enhancement (CLAHE)
 
 The metadata generated produces various statistical and geometric properties of a biological specimen image or collection in a JSON format. When a single file is passed, the data is yielded to the console (stdout). When a directory is passed, the data is stored in a JSON file.
+There is current ly 2 versions on the drexel :
+
+### 1- Original version: developped by Kevin Karnani and Joel Pepper: 
+
+To activate this version in the config/config.json file set "Version" to "drexel"
+'''
+"Version":"drexel"
+'''
 
 To generate the metadata, run the following command:
-```bash
-pipenv run python3 gen_metadata.py [file_or_dir_name]
+```
+python gen_metadata.py [file_or_dir_name]
 ```
 
 ## Properties Generated
@@ -114,6 +132,43 @@ pipenv run python3 gen_metadata.py [file_or_dir_name]
 | solidity             | Per Fish                 | Float             | The ratio of pixels in the fish to pixels of the convex hull image.                                                              |
 | std             | Per Fish                 | Float             | The standard deviation of the mask pixel coordinate distribution. |
 
+
+### 2- BGNN version : adaption by Thibault Tabarin
+
+In this version, we have reshape the output format and simplying it to match the requirement of the [BGNN_Snakemake workflow](https://github.com/hdr-bgnn/BGNN_Snakemake).
+
+To activate this version in the config/config.json file set "Version" to "bgnn"
+'''
+"Version":"bgnn"
+'''
+
+Usage:
+```
+python gen_metadata.py <input_file> <metadata.json> <mask.png>
+```
+
+#### metadata.json
+| Key                   | Association   | Type    | Explanation                                                            |
+|:----------------------|:--------------|:--------|:-----------------------------------------------------------------------|
+| Base_name             | Overall image | string  | image name without extension                                           |
+| version               | Overall image | stirng  | explicitly indicate if output from drexel code or BGNN_metadata        |
+| fish                  | Fish          | dict    | collect metadata of the "main fish" with the highest score             |
+| fish.fish_num         | Fish          | int     | number of fish detected in the image                                   |
+| fish.bbox             | Fish          | list    | Bounding box of the main fish [left,top,right,bottom]                  |
+| fish.pixel_analysis   | Fish          | boolean | If pixel analysis succeeded True, else False                           |
+| fish.rescale          | Fish          | string  | Indicate if “rescale” was used to detect the eye                       |
+| fish.eye_bbox         | Fish          | list    | Bounding box of the eye in the main fish [l,t,r,b]                     |
+| fish.angle_degree     | Fish          | float   | angle of the PCA of the mask                                           |
+| fish.eye_direction    | Fish          | string  | eye facing left or righ                                                |
+| fish. foreground_mean | Fish          | float   | Average of pixel value inside the mask                                 |
+| fish.foreground_std   | Fish          | float   | Standart deviation of pixel value inside the mask                      |
+| ruler                 | Ruler         | dict    | collect metadata of the ruler                                          |
+| ruler.bbox            | Ruler         | list    | Bounding box of the ruler [left,top,right,bottom]                      |
+| ruler.scale           | Ruler         | float   | pixel/unit (distance between number "2" and "3" corrected by the unit) |
+| ruler.unit            | Ruler         | string  | indicate unit (cm or inch) in which the scale is express (pixel/cm)    |
+
+#### Mask.png
+
 ## Authors
 
 Joel Pepper

From 9f5ec05b58c012abd6b555405a368195c9a674c8 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <thibault.tabarin@gmail.com>
Date: Fri, 30 Sep 2022 10:55:41 -0600
Subject: [PATCH 07/28] fix scale value in gen_metadata.py and add .csv with
 properties description

---
 Metadata_bgnn_properties.csv | 17 +++++++++++++++++
 gen_metadata.py              |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 Metadata_bgnn_properties.csv

diff --git a/Metadata_bgnn_properties.csv b/Metadata_bgnn_properties.csv
new file mode 100644
index 0000000..67cef49
--- /dev/null
+++ b/Metadata_bgnn_properties.csv
@@ -0,0 +1,17 @@
+Key,Association,Type ,Explanation
+Base_name,Overall image,string,image name without extension
+version,Overall image,string,explicitly indicate if output from drexel code or BGNN_metadata
+fish,Fish,dict,"collect metadata of the ""main fish"" with the highest score"
+fish.fish_num,Fish,int,number of fish detected in the image
+fish.bbox,Fish,list,"Bounding box of the main fish [left,top,right,bottom]"
+fish.pixel_analysis,Fish,boolean,"If pixel analysis succeeded True, else False"
+fish.rescale,Fish,string,Indicate if “rescale” was used to detect the eye
+fish.eye_bbox,Fish,list,"Bounding box of the eye in the main fish [l,t,r,b]"
+fish.angle_degree,Fish,float,angle of the PCA of the mask 
+fish.eye_direction,Fish,string,eye facing left or right
+fish.foreground_mean,Fish,float,Average of pixel value inside the mask
+fish.foreground_std,Fish,float,Standard deviation of pixel value inside the mask
+ruler,Ruler,dict,collect metadata of the ruler
+ruler.bbox,Ruler,list,"Bounding box of the ruler [left,top,right,bottom]"
+ruler.scale,Ruler,float,"pixel/unit (distance between number ""2"" and ""3"" corrected by the unit)"
+ruler.unit,Ruler,string,indicate unit (cm or inch) in which the scale is expressed (pixel/cm)
diff --git a/gen_metadata.py b/gen_metadata.py
index 1bc08c3..1f5b769 100755
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -1048,7 +1048,7 @@ def reformat_for_bgnn(result):
     
     # Ruler metadata
     ruler_bbox  = first_value['ruler_bbox'] if first_value['has_ruler'] else "None"
-    scale = first_value['ruler_bbox'] if "scale" in first_value.keys() else "None"
+    scale = round(first_value['scale'],2) if "scale" in first_value.keys() else "None"
     unit = first_value['unit'] if "unit" in first_value.keys() else "None"
     
     dict_ruler = {'bbox':ruler_bbox, 'scale':scale, 'unit':unit}

From 180eb4e288890e54972b9a2ae7308d4ba029e0f1 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Fri, 30 Sep 2022 11:25:35 -0600
Subject: [PATCH 08/28] Update README.md

---
 README.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/README.md b/README.md
index f347643..eac22bc 100644
--- a/README.md
+++ b/README.md
@@ -169,9 +169,42 @@ python gen_metadata.py <input_file> <metadata.json> <mask.png>
 
 #### Mask.png
 
+## Container and usage
+
+We use a GitHub Action to build a Docker image based on the Dockerfile. The containerized version produces the bgnn format (in config/config.json, "Version":"bgnn").
+
+To use the container:
+1- pull the image 
+```
+docker pull ghcr.io/hdr-bgnn/drexel_metadata:release
+```
+or 
+```
+singularity pull docker://ghcr.io/hdr-bgnn/drexel_metadata:release
+```
+
+2- execute the container with singularity
+```
+singularity exec drexel_metadata_release.sif python /pipeline/gen_metadata.py <input_file> <metadata.json> <mask.png>
+```
+
+## Alternative repository [BGNN_metadata](https://github.com/thibaulttabarin/BGNN_metadata/) 
+
+In this other repo we have refactored the code to increase readability and help further development. We removed unused parts (stemming from development) and integrated more appropriate libraries. We reorganised the folder structure and improved the documentation. This repository is a "work in progress"; the training folder is marked "to be done".
+During the improvement (refactoring and restructuring), we modified the original repository too much, so we decided to create this [BGNN_metadata](https://github.com/thibaulttabarin/BGNN_metadata/) repository in parallel.
+If the existence of this repository improves usability and further development, and this original repository is not maintained, we should consider switching to the new repository.
+
+## Associated Publication
+
+[Joel Pepper et al.](https://ieeexplore.ieee.org/document/9651834)
+
+[Kevin Karnani et al.](https://assets.researchsquare.com/files/rs-1506561/v1_covered.pdf?c=1651071974)
+
 ## Authors
 
 Joel Pepper
 
 Kevin Karnani
 
+Thibault Tabarin
+

From 0c0795e4e80f611db442da0fa6113286bbd2e2bf Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Wed, 12 Oct 2022 11:29:23 -0700
Subject: [PATCH 09/28] Update gen_metadata.py

---
 gen_metadata.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gen_metadata.py b/gen_metadata.py
index 1f5b769..850ebb8 100755
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -40,10 +40,9 @@
 from scipy import stats
 from skimage import filters, measure
 from skimage.morphology import flood_fill
-#from torch.multiprocessing import Pool
 import warnings
-warnings.filterwarnings("ignore")
-# torch.multiprocessing.set_start_method('forkserver')
+# remove harmless warning  intrinsic to the code structure 
+warnings.filterwarnings("ignore") 
 
 # ensure the look at the right place for the configuration file
 root_file_path = os.path.dirname(__file__)

From caa1a5d264c0e7f63a5016fec88e9bc88225f754 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Wed, 12 Oct 2022 11:48:55 -0700
Subject: [PATCH 10/28] Delete dataverse_download.py

---
 dataverse_download.py | 71 -------------------------------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 dataverse_download.py

diff --git a/dataverse_download.py b/dataverse_download.py
deleted file mode 100644
index 3f18151..0000000
--- a/dataverse_download.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/local/bin/python
-# Script to download a dataset from a Dataverse (https://dataverse.org/)
-#' @author
-#' John Bradley: initial code
-#' Thibault Tabarin: small modification
-#' 
-
-
-import os
-import sys
-import hashlib
-from pyDataverse.api import NativeApi, DataAccessApi
-
-
-def download_dataset(base_url, api_token, doi, directory_output):
-    api = NativeApi(base_url, api_token)
-    data_api = DataAccessApi(base_url, api_token)
-    dataset = api.get_dataset(doi)
-    files_list = dataset.json()['data']['latestVersion']['files']
-    for dv_file in files_list:
-        filepath = download_file(data_api, dv_file, directory_output)
-        verify_checksum(dv_file, filepath)
-
-
-def download_file(data_api, dv_file, directory_output):
-    filepath = dv_file["dataFile"]["filename"]
-    #directory_label = dv_file["directoryLabel"]
-    os.makedirs(directory_output, exist_ok=True)
-    filepath = os.path.join(directory_output, filepath)
-    file_id = dv_file["dataFile"]["id"]
-    print("Downloading file {}, id {}".format(filepath, file_id))
-    response = data_api.get_datafile(file_id)
-    with open(filepath, "wb") as f:
-        f.write(response.content)
-    return filepath
-
-
-def verify_checksum(dv_file, filepath):
-    checksum = dv_file["dataFile"]["checksum"]
-    checksum_type = checksum["type"]
-    checksum_value = checksum["value"]
-    if checksum_type != "MD5":
-        raise ValueError(f"Unsupported checksum type {checksum_type}")
-
-    with open(filepath, 'rb') as infile:
-        hash = hashlib.md5(infile.read()).hexdigest()
-        if checksum_value == hash:
-            print("Verified file checksum for {filepath}.")
-        else:
-            raise ValueError(f"Hash value mismatch for {filepath}: {checksum_value} vs {hash} ")
-
-
-def show_usage():
-   print()
-   print(f"Usage: python {sys.argv[0]} <dataverse_base_url> <doi> <directory_output>\n")
-   print("To specify a API token set the DATAVERSE_API_TOKEN environment variable.\n")
-   print("To set the environment variable : export DATAVERSE_API_TOKEN=<my_token>")
-   print()
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 4:
-         show_usage()
-         sys.exit(1)
-    else:
-        BASE_URL = sys.argv[1]
-        DOI = sys.argv[2]
-        directory_output = sys.argv[3]
-        API_TOKEN = os.environ.get('DATAVERSE_API_TOKEN')
-        #print(API_TOKEN)
-        download_dataset(BASE_URL, API_TOKEN, DOI, directory_output)

From 849b5641d697e472e4c74f8b3333bd5524fcea88 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Wed, 12 Oct 2022 14:18:33 -0700
Subject: [PATCH 11/28] Create dataverse_download.py

Just a clone of dataverse_download.py from the original repo:
https://github.com/hdr-bgnn/BGNN-trait-segmentation/blob/main/Segment_mini/scripts/dataverse_download.py
Adds some information to the header.
Co-authored-by: @johnbradley
---
 dataverse_download.py | 79 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 dataverse_download.py

diff --git a/dataverse_download.py b/dataverse_download.py
new file mode 100644
index 0000000..ebbf65a
--- /dev/null
+++ b/dataverse_download.py
@@ -0,0 +1,79 @@
+# Script to download a dataset from a Dataverse (https://dataverse.org/)
+# author : John Bradley
+# "Clone" from https://github.com/hdr-bgnn/BGNN-trait-segmentation/blob/main/Segment_mini/scripts/dataverse_download.py
+# Description : download a file from dataverse on osc
+# Provide template for further development and help user start
+# Provide various functions to navigate the metadata and file on the dataverse.
+# This version require the API_DATAVERSE_TOKEN
+
+import os
+import sys
+import hashlib
+from pyDataverse.api import NativeApi, DataAccessApi
+
+
+def download_file_in_dataset(base_url, api_token, doi, src, dest):
+    api = NativeApi(base_url, api_token)
+    data_api = DataAccessApi(base_url, api_token)
+    dataset = api.get_dataset(doi)
+    files_list = dataset.json()['data']['latestVersion']['files']
+    for dv_file in files_list:
+        remote_path = get_directory_path(dv_file)
+        if remote_path == src:
+            os.makedirs(os.path.dirname(dest), exist_ok=True)
+            filepath = download_file(data_api, dv_file, dest)
+            verify_checksum(dv_file, dest)
+            return
+    raise ValueError(f"Unable to find path {src} within {doi}.")
+
+
+def get_directory_path(dv_file):
+    directory_label = dv_file.get("directoryLabel")
+    filename = dv_file["dataFile"]["filename"]
+    if directory_label:
+       return f"{directory_label}/{filename}"
+    return filename
+
+
+def download_file(data_api, dv_file, filepath):
+    file_id = dv_file["dataFile"]["id"]
+    print("Downloading file {}, id {}".format(filepath, file_id))
+    response = data_api.get_datafile(file_id)
+    with open(filepath, "wb") as f:
+        f.write(response.content)
+    return filepath
+
+
+def verify_checksum(dv_file, filepath):
+    checksum = dv_file["dataFile"]["checksum"]
+    checksum_type = checksum["type"]
+    checksum_value = checksum["value"]
+    if checksum_type != "MD5":
+        raise ValueError(f"Unsupported checksum type {checksum_type}")
+
+    with open(filepath, 'rb') as infile:
+        hash = hashlib.md5(infile.read()).hexdigest()
+        if checksum_value == hash:
+            print(f"Verified file checksum for {filepath}.")
+        else:
+            raise ValueError(f"Hash value mismatch for {filepath}: {checksum_value} vs {hash} ")
+
+
+def show_usage():
+   print()
+   print(f"Usage: python {sys.argv[0]} <dataverse_base_url> <doi>\n")
+   print("To specify an API token set the DATAVERSE_API_TOKEN environment variable.")
+   print()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 5:
+         show_usage()
+         sys.exit(1)
+    else:
+         base_url = sys.argv[1]
+         doi = sys.argv[2]
+         source = sys.argv[3]
+         dest = sys.argv[4]
+         api_token = os.environ.get('DATAVERSE_API_TOKEN')
+         download_file_in_dataset(base_url, api_token, doi, source, dest)

From 9d616a6b32a2b2786455e05a09dc799498a38af7 Mon Sep 17 00:00:00 2001
From: thibaulttabarin <50921014+thibaulttabarin@users.noreply.github.com>
Date: Wed, 12 Oct 2022 14:36:53 -0700
Subject: [PATCH 12/28] Update dataverse_download.py

Modification for drexel metadata version
---
 dataverse_download.py | 54 ++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 32 deletions(-)

diff --git a/dataverse_download.py b/dataverse_download.py
index ebbf65a..e3bdbf0 100644
--- a/dataverse_download.py
+++ b/dataverse_download.py
@@ -1,6 +1,8 @@
 # Script to download a dataset from a Dataverse (https://dataverse.org/)
 # author : John Bradley
+# co-author : Thibault Tabarin
 # "Clone" from https://github.com/hdr-bgnn/BGNN-trait-segmentation/blob/main/Segment_mini/scripts/dataverse_download.py
+# Modified from the original clone
 # Description : download a file from dataverse on osc
 # Provide template for further development and help user start
 # Provide various functions to navigate the metadata and file on the dataverse.
@@ -11,31 +13,20 @@
 import hashlib
 from pyDataverse.api import NativeApi, DataAccessApi
 
-
-def download_file_in_dataset(base_url, api_token, doi, src, dest):
+def download_dataset(base_url, api_token, doi, directory_output):
     api = NativeApi(base_url, api_token)
     data_api = DataAccessApi(base_url, api_token)
     dataset = api.get_dataset(doi)
     files_list = dataset.json()['data']['latestVersion']['files']
     for dv_file in files_list:
-        remote_path = get_directory_path(dv_file)
-        if remote_path == src:
-            os.makedirs(os.path.dirname(dest), exist_ok=True)
-            filepath = download_file(data_api, dv_file, dest)
-            verify_checksum(dv_file, dest)
-            return
-    raise ValueError(f"Unable to find path {src} within {doi}.")
-
-
-def get_directory_path(dv_file):
-    directory_label = dv_file.get("directoryLabel")
-    filename = dv_file["dataFile"]["filename"]
-    if directory_label:
-       return f"{directory_label}/{filename}"
-    return filename
-
-
-def download_file(data_api, dv_file, filepath):
+        filepath = download_file(data_api, dv_file, directory_output)
+        verify_checksum(dv_file, filepath)
+        
+def download_file(data_api, dv_file, directory_output):
+    filepath = dv_file["dataFile"]["filename"]
+    #directory_label = dv_file["directoryLabel"]
+    os.makedirs(directory_output, exist_ok=True)
+    filepath = os.path.join(directory_output, filepath)
     file_id = dv_file["dataFile"]["id"]
     print("Downloading file {}, id {}".format(filepath, file_id))
     response = data_api.get_datafile(file_id)
@@ -43,7 +34,6 @@ def download_file(data_api, dv_file, filepath):
         f.write(response.content)
     return filepath
 
-
 def verify_checksum(dv_file, filepath):
     checksum = dv_file["dataFile"]["checksum"]
     checksum_type = checksum["type"]
@@ -58,22 +48,22 @@ def verify_checksum(dv_file, filepath):
         else:
             raise ValueError(f"Hash value mismatch for {filepath}: {checksum_value} vs {hash} ")
 
-
+            
 def show_usage():
    print()
-   print(f"Usage: python {sys.argv[0]} <dataverse_base_url> <doi>\n")
-   print("To specify an API token set the DATAVERSE_API_TOKEN environment variable.")
+   print(f"Usage: python {sys.argv[0]} <dataverse_base_url> <doi> <directory_output>\n")
+   print("To specify a API token set the DATAVERSE_API_TOKEN environment variable.\n")
+   print("To set the environment variable : export DATAVERSE_API_TOKEN=<my_token>")
    print()
 
-
 if __name__ == '__main__':
-    if len(sys.argv) != 5:
+    if len(sys.argv) != 4:
          show_usage()
          sys.exit(1)
     else:
-         base_url = sys.argv[1]
-         doi = sys.argv[2]
-         source = sys.argv[3]
-         dest = sys.argv[4]
-         api_token = os.environ.get('DATAVERSE_API_TOKEN')
-         download_file_in_dataset(base_url, api_token, doi, source, dest)
+        BASE_URL = sys.argv[1]
+        DOI = sys.argv[2]
+        directory_output = sys.argv[3]
+        API_TOKEN = os.environ.get('DATAVERSE_API_TOKEN')
+        #print(API_TOKEN)
+        download_dataset(BASE_URL, API_TOKEN, DOI, directory_output)

From 8cb2853cc9cf20bed9aabcb3d6135e67436d564d Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Wed, 19 Oct 2022 14:44:58 -0400
Subject: [PATCH 13/28] Use Pipfile for installing Docker requirements

Changes Dockerfile to build container with the software versions
specified in the Pipfile.

Fixes bug where Pipfile was using an incompatible python version
for the libraries. See https://github.com/hdr-bgnn/drexel_metadata/issues/7#issuecomment-1284405433
---
 Dockerfile | 90 ++++++++++++++++--------------------------------------
 Pipfile    |  2 +-
 2 files changed, 27 insertions(+), 65 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8c66fce..0c82e2b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,73 +1,35 @@
-FROM ubuntu:20.04
+FROM ghcr.io/imageomics/dataverse-access:0.0.3 as model_fetcher
 ARG DATAVERSE_API_TOKEN
+ENV DATAVERSE_URL=https://datacommons.tdai.osu.edu/
+ENV MODEL_DV_DOI=doi:10.5072/FK2/MMX6FY
 
-# Label
-LABEL org.opencontainers.image.title="metadata generation for fish image drexel version"
-LABEL org.opencontainers.image.authors=" J. Pepper, K. Karmani, T. Tabarin"
-LABEL org.opencontainers.image.source="https://github.com/hdr-bgnn/drexel_metadata"
+# Download model_final.pth
+RUN mkdir -p /model \
+    && dva download $MODEL_DV_DOI /model
 
-ARG DEBIAN_FRONTEND=noninteractive
+FROM python:3.8.10-slim-buster
+LABEL "org.opencontainers.image.authors"="John Bradley <john.bradley@duke.edu>"
+LABEL "org.opencontainers.image.description"="Tool to extract metadata information from fish images"
 
-# Install some basic utilities
-RUN apt-get update && apt-get install -y \
-    curl \
-    ca-certificates \
-    sudo \
-    git \
-    bzip2 \
-    libx11-6 \
-    wget \
-    build-essential \
-    libglib2.0-0 \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
-    sudo \
-    cmake \
-    ninja-build \
- && rm -rf /var/lib/apt/lists/*
+# Install build requirements
+RUN apt-get update \
+    && apt-get install -y python3-dev git gcc g++ libgl1-mesa-glx libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
 
-# Create a working directory
-RUN mkdir /pipeline
-WORKDIR /pipeline
-
-# Create a non-root user and switch to it
-RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
- && chown -R user:user /pipeline
-RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
-USER user
-
-# All users can use /home/user as their home directory
-ENV HOME=/home/user
-RUN chmod 777 /home/user
+# Upgrade pip and install pipenv
+RUN pip install --upgrade pip
+RUN pip install pipenv
 
-# Set up the Conda environment
-ENV CONDA_AUTO_UPDATE_CONDA=false \
-    PATH=/home/user/miniconda/bin:$PATH
-RUN curl -sLo ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
- && chmod +x ~/miniconda.sh \
- && ~/miniconda.sh -b -p ~/miniconda \
- && rm ~/miniconda.sh \
- && conda clean -ya
-
-RUN pip install numpy pandas pynrrd pillow scikit-image jedi==0.17.2 opencv-python-headless pyDataverse==0.3.1
-
-# Detectron2 prerequisites
-RUN pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
-RUN pip install cython
-RUN pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
-# Install detectron2 version 0.6
-RUN python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.8/index.html
+WORKDIR /pipeline
+COPY Pipfile /pipeline/.
 
-# Setup pipeline specific scripts
-ENV PATH="/pipeline:${PATH}"
+# Install requirements
+RUN pipenv install --skip-lock --system
 
-ADD config/ /pipeline/config/
-ADD gen_metadata.py /pipeline/
-ADD dataverse_download.py /pipeline/
-# Download Drexel Model
-RUN python /pipeline/dataverse_download.py https://covid-commons.osu.edu/ doi:10.5072/FK2/MMX6FY /pipeline/output/
+COPY config /pipeline/config
+COPY --from=model_fetcher /model/cache/torch/hub/checkpoints/model_final.pth \
+                          /pipeline/output/enhanced/model_final.pth
+COPY gen_metadata.py /pipeline
 
-# Set the default command to a usage statement
-CMD echo "Usage generate Metadata drexel version for Minnows Project:\n"\
-"gen_metadata.py  <fish_image.jpg> <metadata.json> <mask.png>"
+COPY test.sh /pipeline
+CMD bash test.sh
diff --git a/Pipfile b/Pipfile
index 22bdcbb..dc86b76 100644
--- a/Pipfile
+++ b/Pipfile
@@ -31,4 +31,4 @@ pycallgraph = "*"
 [dev-packages]
 
 [requires]
-python_version = "3.10"
+python_version = "3.8.10"

From 03a95c1486a7fbe8da3a91d1a4956765a45bbd7d Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Wed, 19 Oct 2022 14:48:13 -0400
Subject: [PATCH 14/28] Simplify modifications to gen_metadata.py

Adds command line arguments for changes necessary to gen_metadata.py.
Removes logic that transforms the output file.
---
 Dockerfile            |   3 +-
 config/config.json    |   3 +-
 dataverse_download.py |  69 ---------
 gen_metadata.py       | 332 ++++++++++++------------------------------
 4 files changed, 99 insertions(+), 308 deletions(-)
 delete mode 100644 dataverse_download.py
 mode change 100755 => 100644 gen_metadata.py

diff --git a/Dockerfile b/Dockerfile
index 0c82e2b..120f169 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -31,5 +31,4 @@ COPY --from=model_fetcher /model/cache/torch/hub/checkpoints/model_final.pth \
                           /pipeline/output/enhanced/model_final.pth
 COPY gen_metadata.py /pipeline
 
-COPY test.sh /pipeline
-CMD bash test.sh
+CMD echo "python gen_metadata.py"
diff --git a/config/config.json b/config/config.json
index 55fd5c7..8b8406f 100644
--- a/config/config.json
+++ b/config/config.json
@@ -1,5 +1,4 @@
 {
   "ENHANCE": 1,
-  "PROCESSOR": "cpu",
-  "Version":"bgnn"
+  "JOEL": 0
 }
diff --git a/dataverse_download.py b/dataverse_download.py
deleted file mode 100644
index e3bdbf0..0000000
--- a/dataverse_download.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Script to download a dataset from a Dataverse (https://dataverse.org/)
-# author : John Bradley
-# co-author : Thibault Tabarin
-# "Clone" from https://github.com/hdr-bgnn/BGNN-trait-segmentation/blob/main/Segment_mini/scripts/dataverse_download.py
-# Modified from the original clone
-# Description : download a file from dataverse on osc
-# Provide template for further development and help user start
-# Provide various functions to navigate the metadata and file on the dataverse.
-# This version require the API_DATAVERSE_TOKEN
-
-import os
-import sys
-import hashlib
-from pyDataverse.api import NativeApi, DataAccessApi
-
-def download_dataset(base_url, api_token, doi, directory_output):
-    api = NativeApi(base_url, api_token)
-    data_api = DataAccessApi(base_url, api_token)
-    dataset = api.get_dataset(doi)
-    files_list = dataset.json()['data']['latestVersion']['files']
-    for dv_file in files_list:
-        filepath = download_file(data_api, dv_file, directory_output)
-        verify_checksum(dv_file, filepath)
-        
-def download_file(data_api, dv_file, directory_output):
-    filepath = dv_file["dataFile"]["filename"]
-    #directory_label = dv_file["directoryLabel"]
-    os.makedirs(directory_output, exist_ok=True)
-    filepath = os.path.join(directory_output, filepath)
-    file_id = dv_file["dataFile"]["id"]
-    print("Downloading file {}, id {}".format(filepath, file_id))
-    response = data_api.get_datafile(file_id)
-    with open(filepath, "wb") as f:
-        f.write(response.content)
-    return filepath
-
-def verify_checksum(dv_file, filepath):
-    checksum = dv_file["dataFile"]["checksum"]
-    checksum_type = checksum["type"]
-    checksum_value = checksum["value"]
-    if checksum_type != "MD5":
-        raise ValueError(f"Unsupported checksum type {checksum_type}")
-
-    with open(filepath, 'rb') as infile:
-        hash = hashlib.md5(infile.read()).hexdigest()
-        if checksum_value == hash:
-            print(f"Verified file checksum for {filepath}.")
-        else:
-            raise ValueError(f"Hash value mismatch for {filepath}: {checksum_value} vs {hash} ")
-
-            
-def show_usage():
-   print()
-   print(f"Usage: python {sys.argv[0]} <dataverse_base_url> <doi> <directory_output>\n")
-   print("To specify a API token set the DATAVERSE_API_TOKEN environment variable.\n")
-   print("To set the environment variable : export DATAVERSE_API_TOKEN=<my_token>")
-   print()
-
-if __name__ == '__main__':
-    if len(sys.argv) != 4:
-         show_usage()
-         sys.exit(1)
-    else:
-        BASE_URL = sys.argv[1]
-        DOI = sys.argv[2]
-        directory_output = sys.argv[3]
-        API_TOKEN = os.environ.get('DATAVERSE_API_TOKEN')
-        #print(API_TOKEN)
-        download_dataset(BASE_URL, API_TOKEN, DOI, directory_output)
diff --git a/gen_metadata.py b/gen_metadata.py
old mode 100755
new mode 100644
index 850ebb8..1507e20
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -1,24 +1,3 @@
-#!/usr/local/bin/python
-#' Adapatation for BGNN snakemake (minnows project)
-#  on Mon Aug  8 11:35:50 2022
-#' @author
-#' Joel Pepper: initial code
-#' Kevin Karnani: modified it
-#' Thibault Tabarin: modify for minnow project
-
-#' @description
-#' Minnows version original code developped by Joel Pepper and Kevin Karnani'
-#' Using the detectron2 framework from Facebook, we detect fish, eye and ruler object and
-#' collect metadata information such bounding box, fish mask, eye center, fish orientation...
-#' The added version of the ouput are
-#' Dictionnary with metadata:  {"base_name": "", "fish":
-#' {"fish_num": , "bbox": [], "pixel_analysis": true, "eye_bbox": [], "eye_center": [],
-#' "angle_degree": 9.070674226380035, "eye_direction": "left", "foreground_mean": ,
-#' "foreground_std": , "background_mean": , "background_std": },
-#' "ruler": {"bbox": [], "scale": , "unit": ""}}
-#' And fish mask (binary image)
-
-
 import json
 import math
 import os
@@ -26,6 +5,7 @@
 import sys
 import yaml
 from random import shuffle
+import argparse
 
 import gc
 import torch
@@ -40,45 +20,42 @@
 from scipy import stats
 from skimage import filters, measure
 from skimage.morphology import flood_fill
-import warnings
-# remove harmless warning  intrinsic to the code structure 
-warnings.filterwarnings("ignore") 
+from torch.multiprocessing import Pool
 
-# ensure the look at the right place for the configuration file
-root_file_path = os.path.dirname(__file__)
+# torch.multiprocessing.set_start_method('forkserver')
 
 VAL_SCALE_FAC = 0.5
-conf = json.load(open(os.path.join(root_file_path,'config/config.json'), 'r'))
+conf = json.load(open('config/config.json', 'r'))
 ENHANCE = bool(conf['ENHANCE'])
-PROCESSOR = conf['PROCESSOR']
-VERSION = conf['Version'] # option changeable in the config file : "drexel" or "bgnn"
+JOEL = bool(conf['JOEL'])
 IOU_PCT = .02
 
-#with open(os.path.join(root_file_path,'config/mask_rcnn_R_50_FPN_3x.yaml'), 'r') as f:
-    #iters = yaml.load(f, Loader=yaml.FullLoader)["SOLVER"]["MAX_ITER"]
+with open('config/mask_rcnn_R_50_FPN_3x.yaml', 'r') as f:
+    iters = yaml.load(f, Loader=yaml.FullLoader)["SOLVER"]["MAX_ITER"]
 
 
-def init_model(processor=PROCESSOR):
+def init_model(enhance_contrast=ENHANCE, joel=JOEL, device=None):
     """
     Initialize model using config files for RCNN, the trained weights, and other parameters.
 
     Returns:
         predictor -- DefaultPredictor(**configs).
     """
-    root_file_path = os.path.dirname(__file__)
     cfg = get_cfg()
-    cfg.merge_from_file(os.path.join(root_file_path,'config/mask_rcnn_R_50_FPN_3x.yaml'))
+    cfg.merge_from_file("config/mask_rcnn_R_50_FPN_3x.yaml")
     cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
-    OUTPUT_DIR = os.path.join(root_file_path, 'output')
-    cfg.MODEL.WEIGHTS = os.path.join(OUTPUT_DIR, "model_final.pth")
+    if not joel:
+        cfg.OUTPUT_DIR += f"/non_enhanced" if not enhance_contrast else f"/enhanced"
+        # cfg.OUTPUT_DIR += f"/non_enhanced_{iters}" if not enhance_contrast else f"/enhanced_{iters}"
+    cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
     cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
-    cfg.MODEL.DEVICE = processor
+    if device:
+       cfg.MODEL.DEVICE = device
     predictor = DefaultPredictor(cfg)
-    
     return predictor
 
 
-def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_fish=False):
+def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_fish=False, device=None, maskfname=None):
     """
     Generates metadata of an image and stores attributes into a Dictionary.
 
@@ -87,7 +64,7 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
     Returns:
         {file_name: results} -- dictionary of file and associated results.
     """
-    predictor = init_model()
+    predictor = init_model(device=device)
     im = cv2.imread(file_path)
     im_gray = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
     if enhance_contrast:
@@ -161,18 +138,14 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
     if visualize:
         cv2.imshow('prediction', np.array(vis.get_image()[:, :, ::-1], dtype=np.uint8))
         cv2.waitKey(0)
-        
-    if False:    # to save visualization of the prediction       
-        os.makedirs('images', exist_ok=True)
-        os.makedirs('images/enhanced', exist_ok=True)
-        os.makedirs('images/non_enhanced', exist_ok=True)
-        dirname = 'images/'
-        dirname += 'enhanced/' if enhance_contrast else 'non_enhanced/'
-        print(file_name)
-        cv2.imwrite(f'{dirname}/gen_prediction_{f_name}.png',
-                        vis.get_image()[:, :, ::-1])
-    
-
+    os.makedirs('images', exist_ok=True)
+    os.makedirs('images/enhanced', exist_ok=True)
+    os.makedirs('images/non_enhanced', exist_ok=True)
+    dirname = 'images/'
+    dirname += 'enhanced/' if enhance_contrast else 'non_enhanced/'
+    print(file_name)
+    cv2.imwrite(f'{dirname}/gen_prediction_{f_name}.png',
+                vis.get_image()[:, :, ::-1])
     skippable_fish = []
     fish_length = 0
     if fish:
@@ -221,10 +194,6 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
             val = adaptive_threshold(bbox, im_gray)
             bbox, mask, pixel_anal_failed = gen_mask(bbox, file_path,
                                                      file_name, im_gray, val, detectron_mask)
-            
-            # Convert the nask to np.uint8 to save it latter
-            mask_uint8 = np.where(mask == 1, 255, 0).astype(np.uint8)
-            
             centroid, evecs, cont_length, cont_width, length, width, area = pca(mask, scale)
             major, minor = evecs[0], evecs[1]
 
@@ -232,6 +201,9 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
                 print('Mask failed: {file_name}')
                 results['errored'] = True
             else:
+                if maskfname:
+                    mask_uint8 = np.where(mask == 1, 255, 0).astype(np.uint8)
+                    cv2.imwrite(maskfname, mask_uint8)
                 im_crop = im_gray[bbox[1]:bbox[3], bbox[0]:bbox[2]].reshape(-1)
                 mask_crop = mask[bbox[1]:bbox[3], bbox[0]:bbox[2]].reshape(-1)
                 mask_coords = np.argwhere(mask != 0)[:, [1, 0]]
@@ -287,7 +259,7 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
                     need_scaling = True
                     factor = 4
                     eye_center, side, clock_val = upscale(
-                        im, bbox, f_name, factor)
+                        im, bbox, f_name, factor, device)
                     if eye_center is not None and side is not None:
                         results['fish'][i]['eye_center'] = eye_center
                         results['fish'][i]['side'] = side
@@ -331,13 +303,13 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
     results['fish_count'] = len(insts[(insts.pred_classes == 0).logical_and(insts.scores > 0.3)]) - \
                             len(skippable_fish) if multiple_fish else int(results['has_fish'])
     results['detected_fish_count'] = fish_length
-    return {f_name: results}, mask_uint8
+    return {f_name: results}
 
 
-def gen_metadata_upscale(file_path, fish):
+def gen_metadata_upscale(file_path, fish, device=None):
     gc.collect()
     torch.cuda.empty_cache()
-    predictor = init_model()
+    predictor = init_model(device=device)
     im = fish
     im_gray = cv2.cvtColor(fish, cv2.COLOR_BGR2GRAY)
     output = predictor(im)
@@ -406,14 +378,14 @@ def gen_metadata_upscale(file_path, fish):
     return {f_name: results}
 
 
-def upscale(im, bbox, f_name, factor):
+def upscale(im, bbox, f_name, factor, device):
     h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
     scaled = cv2.resize(im[bbox[1]:bbox[3], bbox[0]:bbox[2]].copy(), (w * factor, h * factor),
                         interpolation=cv2.INTER_CUBIC)
     os.makedirs('images/testing', exist_ok=True)
     cv2.imwrite(f'images/testing/{f_name}.png', scaled)
     eye_center, side, clock_val, scale = None, None, None, None
-    new_data = gen_metadata_upscale(f'images/testing/{f_name}.png', scaled)
+    new_data = gen_metadata_upscale(f'images/testing/{f_name}.png', scaled, device=device)
     if 'fish' in new_data[f'{f_name}'] and new_data[f'{f_name}']['fish'][0]['has_eye']:
         eye_center = new_data[f'{f_name}']['fish'][0]['eye_center']
         eye_x, eye_y = eye_center
@@ -895,6 +867,17 @@ def gen_mask(bbox, file_path, file_name, im_gray, val, detectron_mask, flipped=F
         new_mask = detectron_mask.astype('uint8')
         bbox = bbox_orig
         failed = True
+    # arr4 = np.where(new_mask == 1, 255, 0).astype(np.uint8)
+    # (left, top, right, bottom) = shrink_bbox(new_mask)
+    # arr4[top:bottom, left] = 175
+    # arr4[top:bottom, right] = 175
+    # arr4[top, left:right] = 175
+    # arr4[bottom, left:right] = 175
+    # im2 = Image.fromarray(arr4, 'L')
+    # dirname = 'images/'
+    # dirname += 'enhanced/' if ENHANCE else 'non_enhanced/'
+    # f_name = file_name.split('.')[0]
+    # im2.save(f'{dirname}/gen_mask_{f_name}.png')
     return bbox, new_mask, failed
 
 
@@ -933,193 +916,72 @@ def shrink_bbox(mask):
     return cmin, rmin, cmax, rmax
 
 
-def gen_metadata_safe(file_path):
+def gen_metadata_safe(file_path, maskfname=None, device=None):
     """
     Deals with erroneous metadata generation errors.
     """
     try:
-        result, mask_uint8 = gen_metadata(file_path)
-        return result, mask_uint8
+        return gen_metadata(file_path, maskfname=maskfname, device=device)
     except Exception as e:
         print(f'{file_path}: Errored out ({e})')
         return {file_path: {'errored': True}}
 
-def show_usage_drexel():
-    
-    print()
-    print(f'Usage : {sys.argv[0]} <file_path> <output.json>\n')
-    print('Version drexel with output format for BGNN')
-
-def main_drexel():
-    """
-    Main function from Drexel version used by Joel and Kevin
-    the result are save automatically in folder and file describe in the following code
-    The input is the extract from argument passed to the fucntion called in command line:
-        Input could be :
-            + folder containing many image files
-            + a single file 
-            + a serie of file
-        output :
-            if multi files, everything is aggregated in adictionary save as "metadata.json"
-            if single file, print the file as pretty print
-    Arguments input :
-        if only one : a folder or a single file
-        if more than 1
-    Returns
-    -------
-    None.
 
-    """
-    # show usage if no argument given
-
-    
-    if len(sys.argv)==2:
-        
-        direct = sys.argv[1]
-        fname = "metadata.json"
-        if os.path.isdir(direct):
-            files = [entry.path for entry in os.scandir(direct)]
-        else:
-            files = [direct]
-            
-   # show usage if wrong number of arguments given
+def argument_parser():
+    parser = argparse.ArgumentParser(description='Generate metadata for one or more fish images.')
+    parser.add_argument('file_or_directory',
+                        help='Path to a fish image or a directory of multiple fish images. '
+                             'When one file is passed the JSON metadata is printed to the terminal.')
+    parser.add_argument('limit', type=int, nargs='?',
+                        help='Limit the number of images processed from a directory')
+    parser.add_argument('--outfname',
+                        help='Output filename to use for JSON metadata (disables printing to terminal).')
+    parser.add_argument('--device', choices=['cpu', 'cuda'], default=None,
+                        help='Override the default device used for the ML model.')
+    parser.add_argument('--maskfname',
+                        help='Save a mask image with the provided filename. '
+                             'Only supported when processing a single image file.')
+    return parser
+
+
+def main():
+    parser = argument_parser()
+    args = parser.parse_args()
+    direct = args.file_or_directory
+    if os.path.isdir(direct):
+        files = [entry.path for entry in os.scandir(direct)]
+        if args.limit:
+            files = files[:args.limit]
     else:
-        show_usage_drexel()
-        return
-    
+        files = [direct]
+
     #with Pool(2) as p:
     #    results = p.map(gen_metadata_safe, files)
-    results = map(gen_metadata_safe, files)
-    
+    num_files = len(files)
+    if num_files == 1:
+        results = [gen_metadata_safe(files[0], maskfname=args.maskfname, device=args.device)]
+    else:
+        if args.maskfname:
+            print("Error: The `--maskfname` argument cannot be used with multiple input files.")
+            sys.exit(0)
+        results = map(gen_metadata_safe, files, device=[args.device] * num_files)
     output = {}
-    for i,mask in results:
+    for i in results:
         output[list(i.keys())[0]] = list(i.values())[0]
-
-    with open(fname, 'w') as f:
-        json.dump(output, f)
-
-
-def reformat_for_bgnn(result):
-    """
-    Reformat and reduce the size of the result dictionary. 
-    Collect only the data necessary for BGNN minnow project. The new format matches the 
-    BGNN_metadata version. Therefore some of the value not calcualted in drexel version are by 
-    defaulset to "None". 
-
-    Parameters
-    ----------
-    result : dict
-        DESCRIPTION. output from gen_metadata()
-
-    Returns
-    -------
-    bgnn_result : dict
-        DESCRIPTION. {'base_name': xx, 'version':xx, 
-                       'fish': {'fish_num': xx,"bbox":xx, 'pixel_analysis':xx, 'rescale':xx, 
-                            'eye_bbox': xx, 'eye_center':xx , 'angle_degree': xx,
-                            'eye_direction':xx, 'foreground_mean':xx, 'background_mean':xx}, 
-                       'ruler': {'bbox':xx, 'scale':xx, 'unit':xx}}
-
-    """
-    
-    name_base = list(result.keys())[0]
-    first_value = list(result.values())[0]
-    
-    # Fish metadata
-    fish_num = first_value['fish_count']
-    fish_bbox = first_value['fish'][0]['bbox']
-    pixel_analysis = False if first_value['fish'][0]['pixel_analysis_failed'] else True
-    
-    if first_value['fish'][0]['has_eye']:
-        eye_center = first_value['fish'][0]['eye_center']
-    else :
-        eye_center = "None"
-    
-    eye_direction = first_value['fish'][0]['side']
-    foreground_mean = first_value['fish'][0]['foreground']['mean']
-    background_mean = first_value['fish'][0]['background']['mean']
-        
-    dict_fish = {'fish_num': fish_num,"bbox":fish_bbox, 
-                 'pixel_analysis':pixel_analysis, 'rescale':"None", 
-                 'eye_bbox': "None", 'eye_center':eye_center , 'angle_degree': "None",
-                 'eye_direction':eye_direction, 'foreground_mean':round(foreground_mean,2), 
-                 'background_mean':round(background_mean,2)}
-    
-    # Ruler metadata
-    ruler_bbox  = first_value['ruler_bbox'] if first_value['has_ruler'] else "None"
-    scale = round(first_value['scale'],2) if "scale" in first_value.keys() else "None"
-    unit = first_value['unit'] if "unit" in first_value.keys() else "None"
-    
-    dict_ruler = {'bbox':ruler_bbox, 'scale':scale, 'unit':unit}
-    
-    bgnn_result = {'base_name': name_base, 'version':"from drexel", 
-                   'fish': dict_fish, 'ruler': dict_ruler} 
-    
-    return bgnn_result
-
-
-def main_bgnn(input_file, output_result, output_mask):
-    '''
-    Use the "gen_metadata" through  gen_metadata_safe
-    1- Calculate metadata and mask with gen_metadata()
-    2- Reformat the result to a simplified version for bgnn minnows project
-    3- save the result in outputs (.json amd .png files)
-
-    Parameters
-    ----------
-    file_path : string
-        location of the imae file to analysis.
-    output_json : string
-        path for dictionnary output in json format (expected '/path/to/save/my_output.json').
-    output_mask : string
-        path for mask image output in png format (expected '/path/to/save/my_mask.png').
-
-    Returns
-    -------
-    None.
-
-    '''
-    try:
-        result, mask_uint8 = gen_metadata(input_file)
-        
-        bgnn_result = reformat_for_bgnn(result)
-        
-    except Exception as e:
-            # write the error in the result dictionnary
-            bgnn_result['error'] = f'({e})'
-            print(f'{input_file}: Errored out ({e})')
-            
-    with open(output_result, 'w') as f:
-        json.dump(bgnn_result, f)
-    
-    if output_mask != None:
-        cv2.imwrite(output_mask, mask_uint8)   
-
-
-def show_usage_bgnn():
-    
-    #print()
-    print(f'Usage : {sys.argv[0]} <file_path> <metadata.json> <mask.png>\n')
-    print('Version drexel with output format for BGNN using "main_bgnn()"')
+    if args.outfname:
+       fname = args.outfname
+    else:
+        fname = f'metadata_{iters}.json' if not JOEL else 'metadata.json'
+        if ENHANCE:
+            fname = 'enhanced_' + fname
+        else:
+            fname = 'non_enhanced_' + fname
+    if len(output) > 1 or args.outfname:
+        with open(fname, 'w') as f:
+            json.dump(output, f)
+    else:
+        pprint.pprint(output)
 
 
 if __name__ == '__main__':
-    
-    if VERSION == 'drexel':
-        print(f'version : {VERSION}')
-        main_drexel()
-        
-    if VERSION == 'bgnn':
-        if len(sys.argv) == 4:
-            print(f'version : drexel for {VERSION}')
-            input_file = sys.argv[1]
-            output_json = sys.argv[2]
-            output_mask = sys.argv[3]
-            main_bgnn(input_file, output_json, output_mask)
-        else:
-            show_usage_bgnn()
-            
-            
-        
-        
-    
+    main()

From a129ee2b6cc313c49e2ec9d2ae6bdc640b29a924 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Wed, 19 Oct 2022 14:57:25 -0400
Subject: [PATCH 15/28] Only rebuild container on release

Changes the GitHub Actions configuration to deploy a container when a
GitHub release is published.
---
 .github/workflows/deploy-image.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy-image.yml b/.github/workflows/deploy-image.yml
index 9c9259b..1d5efbd 100644
--- a/.github/workflows/deploy-image.yml
+++ b/.github/workflows/deploy-image.yml
@@ -1,8 +1,8 @@
 name: Create and publish a Docker image
 
 on:
-  push:
-    branches: ['release']
+  release:
+    types: [published]
 
 env:
   REGISTRY: ghcr.io

From 491c0963da65bf7aaead13f14850136d5a33f92e Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Thu, 20 Oct 2022 09:10:22 -0400
Subject: [PATCH 16/28] Clear pipenv cache

Remove cache created by pipenv to reduce container size.
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 120f169..b8c9997 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,7 +24,7 @@ WORKDIR /pipeline
 COPY Pipfile /pipeline/.
 
 # Install requirements
-RUN pipenv install --skip-lock --system
+RUN pipenv install --skip-lock --system && pipenv --clear
 
 COPY config /pipeline/config
 COPY --from=model_fetcher /model/cache/torch/hub/checkpoints/model_final.pth \

From 8fcfebbcd46a8f4e005c555e50852b70a62764ee Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Mon, 24 Oct 2022 09:10:25 -0400
Subject: [PATCH 17/28] Allow specifying the visualization filename

Adds a --visfname command line argument that allows a user
to override the visualization image filename.
---
 gen_metadata.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/gen_metadata.py b/gen_metadata.py
index 1507e20..0898644 100644
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -55,7 +55,8 @@ def init_model(enhance_contrast=ENHANCE, joel=JOEL, device=None):
     return predictor
 
 
-def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_fish=False, device=None, maskfname=None):
+def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_fish=False, device=None, maskfname=None,
+                 visfname=None):
     """
     Generates metadata of an image and stores attributes into a Dictionary.
 
@@ -138,14 +139,15 @@ def gen_metadata(file_path, enhance_contrast=ENHANCE, visualize=False, multiple_
     if visualize:
         cv2.imshow('prediction', np.array(vis.get_image()[:, :, ::-1], dtype=np.uint8))
         cv2.waitKey(0)
-    os.makedirs('images', exist_ok=True)
-    os.makedirs('images/enhanced', exist_ok=True)
-    os.makedirs('images/non_enhanced', exist_ok=True)
-    dirname = 'images/'
-    dirname += 'enhanced/' if enhance_contrast else 'non_enhanced/'
-    print(file_name)
-    cv2.imwrite(f'{dirname}/gen_prediction_{f_name}.png',
-                vis.get_image()[:, :, ::-1])
+    if not visfname:
+        os.makedirs('images', exist_ok=True)
+        os.makedirs('images/enhanced', exist_ok=True)
+        os.makedirs('images/non_enhanced', exist_ok=True)
+        dirname = 'images/'
+        dirname += 'enhanced/' if enhance_contrast else 'non_enhanced/'
+        print(file_name)
+        visfname = f'{dirname}/gen_prediction_{f_name}.png'
+    cv2.imwrite(visfname, vis.get_image()[:, :, ::-1])
     skippable_fish = []
     fish_length = 0
     if fish:
@@ -916,12 +918,12 @@ def shrink_bbox(mask):
     return cmin, rmin, cmax, rmax
 
 
-def gen_metadata_safe(file_path, maskfname=None, device=None):
+def gen_metadata_safe(file_path, maskfname=None, visfname=None, device=None):
     """
     Deals with erroneous metadata generation errors.
     """
     try:
-        return gen_metadata(file_path, maskfname=maskfname, device=device)
+        return gen_metadata(file_path, maskfname=maskfname, visfname=visfname, device=device)
     except Exception as e:
         print(f'{file_path}: Errored out ({e})')
         return {file_path: {'errored': True}}
@@ -941,6 +943,9 @@ def argument_parser():
     parser.add_argument('--maskfname',
                         help='Save a mask image with the provided filename. '
                              'Only supported when processing a single image file.')
+    parser.add_argument('--visfname',
+                        help='Overwrites default visualization filename. '
+                             'Only supported when processing a single image file.')
     return parser
 
 
@@ -959,11 +964,15 @@ def main():
     #    results = p.map(gen_metadata_safe, files)
     num_files = len(files)
     if num_files == 1:
-        results = [gen_metadata_safe(files[0], maskfname=args.maskfname, device=args.device)]
+        results = [gen_metadata_safe(files[0], maskfname=args.maskfname,
+                                     visfname=args.visfname, device=args.device)]
     else:
         if args.maskfname:
             print("Error: The `--maskfname` argument cannot be used with multiple input files.")
             sys.exit(0)
+        if args.visfname:
+            print("error: the `--visfname` argument cannot be used with multiple input files.")
+            sys.exit(0)
         results = map(gen_metadata_safe, files, device=[args.device] * num_files)
     output = {}
     for i in results:

From ae1a0c729051ff08abe85457f6ff65a45e353788 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Tue, 25 Oct 2022 13:26:41 -0400
Subject: [PATCH 18/28] Simplify README and remove unrelated CSV

Removes minnows details from README.
Removes minnows specific CSV file.
---
 Metadata_bgnn_properties.csv | 17 -------
 README.md                    | 99 +++++-------------------------------
 2 files changed, 14 insertions(+), 102 deletions(-)
 delete mode 100644 Metadata_bgnn_properties.csv

diff --git a/Metadata_bgnn_properties.csv b/Metadata_bgnn_properties.csv
deleted file mode 100644
index 67cef49..0000000
--- a/Metadata_bgnn_properties.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-Key,Association,Type ,Explanation
-Base_name,Overall image,string,image name without extension
-version,Overall image,stirng,explicitly indicate if output from drexel code or BGNN_metadata
-fish,Fish,dict,"collect metadata of the ""main fish"" with the highest score"
-fish.fish_num,Fish,int,number of fish detected in the image
-fish.bbox,Fish,list,"Bounding box of the main fish [left,top,right,bottom]"
-fish.pixel_analysis,Fish,boolean,"If pixel analysis succeeded True, else False"
-fish.rescale,Fish,string,Indicate if “rescale” was used to detect the eye
-fish.eye_bbox,Fish,list,"Bounding box of the eye in the main fish [l,t,r,b]"
-fish.angle_degree,Fish,float,angle of the PCA of the mask 
-fish.eye_direction,Fish,string,eye facing left or righ
-fish. foreground_mean,Fish,float,Average of pixel value inside the mask
-fish.foreground_std,Fish,float,Standart deviation of pixel value inside the mask
-ruler,Ruler,dict,collect metadata of the ruler
-ruler.bbox,Ruler,list,"Bounding box of the ruler [left,top,right,bottom]"
-ruler.scale,Ruler,float,"pixel/unit (distance between number ""2"" and ""3"" corrected by the unit)"
-ruler.unit,Ruler,string,indicate unit (cm or inch) in which the scale is express (pixel/cm)
diff --git a/README.md b/README.md
index eac22bc..d003d29 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,13 @@
 # Drexel Metadata
 
 ## Goal
-
-The objective of this repos is to present a methodology using Neural network and classic image processing to extract automatically metadata information from fish image coming from museum. The code was initially develop by Joel Pepper and Kevin karnani who essential train a neural network model (detectron2) to identify object in the image just as fish, fish eye, scale and number on the scale ("2', "3"). In second phase "pixel analysis" technique was used to refine the mask detection around the fish. In a third phase, various classic image processing techniques were used to calculate dimension and various properties on each objects such as (fish bounding box, fish orientation, eye orientation, scale bar value....)
+To develop a tool to check the validity of metadata associated with an image, and generate things that are missing. Also includes various geometric and statistical properties on the mask generated over the biological specimen presented.
 
 ## Functionality
 
-Object detection (detectron2) is currently being performed on 5 detection classes (fish, fish eyes, rulers, and the twos and threes found on rulers). The current setup is performed on the INHS and UWZM biological specimen image repositories.
+Object detection is currently being performed on 5 detection classes (fish, fish eyes, rulers, and the twos and threes found on rulers). The current setup is performed on the INHS and UWZM biological specimen image repositories.
 
-### Current Criteria on which the model has been develop
+### Current Criteria
 
 1. Image must contain a fish species (no eels, seashells, butterflies, seahorses,
 snakes, etc).
@@ -21,15 +20,6 @@ in training set).
 7. Fish body must not be folded and should have no curvature.
 
 These do not need to be adhered to if properly set up/modified for a specific use case.
-The model is available on data commons osc as "Drexel_metadata_generator" at https://datacommons.tdai.osu.edu/dataverse/fish-traits/ 
-However this model is not published yet. To download you need an acount on datacommons.tdai.osu and you will need to create a DATAVERS_API_TOKEN 
-and you can use :
-```
-python dataverse_download.py https://covid-commons.osu.edu/ doi:10.5072/FK2/MMX6FY output/
-```
-Or you can do it manually on the web page  https://datacommons.tdai.osu.edu/dataset.xhtml?persistentId=doi%3A10.5072%2FFK2%2FMMX6FY&version=DRAFT
-
-If you don't have an account the model is usable via the container iamge locatated in this repo in package.
 
 ### Dependencies
 
@@ -75,18 +65,20 @@ The metadata generated is extremely specific to our use case. In addition, we pe
 3. Contrast enhancement (CLAHE)
 
 The metadata generated produces various statistical and geometric properties of a biological specimen image or collection in a JSON format. When a single file is passed, the data is yielded to the console (stdout). When a directory is passed, the data is stored in a JSON file.
-There is current ly 2 versions on the drexel :
 
-### 1- Original version: developped by Kevin Karnani and Joel Pepper: 
-
-To activate this version in the config/config.json file set "Version" to "drexel"
-'''
-"Version":"drexel"
-'''
+### Model
+The trained model is available as "Drexel_metadata_generator" at https://datacommons.tdai.osu.edu/dataverse/fish-traits/.
+The model can be downloaded from that website or via the [dva](https://github.com/Imageomics/dataverse-access) command line utility.
+To download from the command line install dva then run the following command:
+```
+dva download --url https://datacommons.tdai.osu.edu/ doi:10.5072/FK2/MMX6FY .
+```
+The above command will download the file and verify the checksum.
 
+### Running
 To generate the metadata, run the following command:
-```
-python gen_metadata.py [file_or_dir_name]
+```bash
+pipenv run python3 gen_metadata.py [file_or_dir_name]
 ```
 
 ## Properties Generated
@@ -132,68 +124,6 @@ python gen_metadata.py [file_or_dir_name]
 | solidity             | Per Fish                 | Float             | The ratio of pixels in the fish to pixels of the convex hull image.                                                              |
 | std             | Per Fish                 | Float             | The standard deviation of the mask pixel coordinate distribution. |
 
-
-### 2- BGNN version : adaption by Thibault Tabarin
-
-In this version, we have reshape the output format and simplying it to match the requirement of the [BGNN_Snakemake workflow](https://github.com/hdr-bgnn/BGNN_Snakemake).
-
-To activate this version in the config/config.json file set "Version" to "bgnn"
-'''
-"Version":"bgnn"
-'''
-
-Usage:
-```
-python gen_metadata.py <input_file> <metadata.json> <mask.png>
-```
-
-#### metadata.json
-| Key                   | Association   | Type    | Explanation                                                            |
-|:----------------------|:--------------|:--------|:-----------------------------------------------------------------------|
-| Base_name             | Overall image | string  | image name without extension                                           |
-| version               | Overall image | stirng  | explicitly indicate if output from drexel code or BGNN_metadata        |
-| fish                  | Fish          | dict    | collect metadata of the "main fish" with the highest score             |
-| fish.fish_num         | Fish          | int     | number of fish detected in the image                                   |
-| fish.bbox             | Fish          | list    | Bounding box of the main fish [left,top,right,bottom]                  |
-| fish.pixel_analysis   | Fish          | boolean | If pixel analysis succeeded True, else False                           |
-| fish.rescale          | Fish          | string  | Indicate if “rescale” was used to detect the eye                       |
-| fish.eye_bbox         | Fish          | list    | Bounding box of the eye in the main fish [l,t,r,b]                     |
-| fish.angle_degree     | Fish          | float   | angle of the PCA of the mask                                           |
-| fish.eye_direction    | Fish          | string  | eye facing left or righ                                                |
-| fish. foreground_mean | Fish          | float   | Average of pixel value inside the mask                                 |
-| fish.foreground_std   | Fish          | float   | Standart deviation of pixel value inside the mask                      |
-| ruler                 | Ruler         | dict    | collect metadata of the ruler                                          |
-| ruler.bbox            | Ruler         | list    | Bounding box of the ruler [left,top,right,bottom]                      |
-| ruler.scale           | Ruler         | float   | pixel/unit (distance between number "2" and "3" corrected by the unit) |
-| ruler.unit            | Ruler         | string  | indicate unit (cm or inch) in which the scale is express (pixel/cm)    |
-
-#### Mask.png
-
-## Container and usage
-
-We use github action to create a docker image base on the Dockerfile. the containerized version is the bgnn format (in config/config.json "Version":"bgnn")
-
-To use the container:
-1- pull the image 
-```
-docker pull ghcr.io/hdr-bgnn/drexel_metadata:release
-```
-or 
-```
-singularity pull docker://ghcr.io/hdr-bgnn/drexel_metadata:release
-```
-
-2- execute the contianer with singularity
-```
-singularity exec drexel_metadata_release.sif python /pipeline/gen_metadata.py <input_file> <metadata.json> <mask.png>
-```
-
-## Alternative repository [BGNN_metadata](https://github.com/thibaulttabarin/BGNN_metadata/) 
-
-In this other repo we have refactorized the code to increase readability and help further development. We removed unused part (stemming from development), integrated more appropriate libraries. We reoprganised the folders structure and imporve documentation. This repository is "work in progress", the training folder is marked "to be done".
-During the improvement (refactorization and restructuration), we modify the original repository too much, therefore we decided to create in parallele this [BGNN_metadata](https://github.com/thibaulttabarin/BGNN_metadata/).
-If the existence of repositiory improve usability and further development and this original repository is not maintained, we should consider switch to this new repos version.
-
 ## Associated Publication
 
 [Joel Pepper et al.](https://ieeexplore.ieee.org/document/9651834)
@@ -207,4 +137,3 @@ Joel Pepper
 Kevin Karnani
 
 Thibault Tabarin
-

From 4420e5b109c58957eb4fd739582ae45d70317f59 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Wed, 26 Oct 2022 08:45:11 -0400
Subject: [PATCH 19/28] Fix map bug

The map call was failing because map() does not accept keyword arguments.
The fix is to pass the device type as a positional argument.
---
 gen_metadata.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gen_metadata.py b/gen_metadata.py
index 0898644..f3530b4 100644
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -918,12 +918,12 @@ def shrink_bbox(mask):
     return cmin, rmin, cmax, rmax
 
 
-def gen_metadata_safe(file_path, maskfname=None, visfname=None, device=None):
+def gen_metadata_safe(file_path, device=None, maskfname=None, visfname=None):
     """
     Deals with erroneous metadata generation errors.
     """
     try:
-        return gen_metadata(file_path, maskfname=maskfname, visfname=visfname, device=device)
+        return gen_metadata(file_path, device=device, maskfname=maskfname, visfname=visfname)
     except Exception as e:
         print(f'{file_path}: Errored out ({e})')
         return {file_path: {'errored': True}}
@@ -973,7 +973,7 @@ def main():
         if args.visfname:
             print("error: the `--visfname` argument cannot be used with multiple input files.")
             sys.exit(0)
-        results = map(gen_metadata_safe, files, device=[args.device] * num_files)
+        results = map(gen_metadata_safe, files, [args.device] * num_files)
     output = {}
     for i in results:
         output[list(i.keys())[0]] = list(i.values())[0]

From 8b64f0bc343efdd839bec9204afa85b59b670187 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Wed, 26 Oct 2022 14:16:40 -0400
Subject: [PATCH 20/28] Update README.md with command line arguments

I also removed `Thibault Tabarin` from the `Authors` section of the README since our changes are now rather minimal.
---
 README.md | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d003d29..dbad82e 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,36 @@ To generate the metadata, run the following command:
 pipenv run python3 gen_metadata.py [file_or_dir_name]
 ```
 
+Usage:
+```
+gen_metadata.py [-h] [--device {cpu,cuda}] [--outfname OUTFNAME] [--maskfname MASKFNAME] [--visfname VISFNAME]
+                       file_or_directory [limit]
+```
+
+The `limit` parameter will limit 
+the number of files processed in the directory. The `limit` positional argument is only applicable when passing a directory. 
+
+#### Device Configuration
+By default `gen_metadata.py` requires a GPU (cuda).
+To use a CPU instead pass the `--device cpu` argument to `gen_metadata.py`.
+
+#### Single File Usage
+The following three arguments are only supported when processing a single image file:
+- `--outfname <filename>` - When passed the script will save the output metadata JSON to `<filename>` instead of printing to the console (the default behavior when processing one file).
+- `--maskfname <filename>` - Enables logic to save an output mask to `<filename>` for the single input file.
+- `--visfname <filename>` - Changes the script to save the output visualization to `<filename>` instead of the hard coded location.
+
+These arguments are meant to simplify adding `gen_metadata.py` to a workflow that process files individually.
+
+
+### Running with Singularity
+A Docker container is automatically built for each **drexel_metadata** release. This container has the requirements installed and includes the model file.
+To run the singularity container for a specific version follow this pattern:
+```
+singularity run docker://ghcr.io/hdr-bgnn/drexel_metadata:<release> python gen_metadata.py ...
+```
+
+
 ## Properties Generated
 
 | **Property**            | **Association** | **Type** | **Explanation**                                                                                                                                   |
@@ -135,5 +165,3 @@ pipenv run python3 gen_metadata.py [file_or_dir_name]
 Joel Pepper
 
 Kevin Karnani
-
-Thibault Tabarin

From f8fdcb479f2cdff42f0bbfe10b5a56bf85ba830f Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Wed, 26 Oct 2022 17:07:55 -0400
Subject: [PATCH 21/28] Update Dockerfile after File Path fix

Updates Dockerfile to account for fixed Dataverse File Path for model_final.pth
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index b8c9997..b1c86c0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,7 @@ COPY Pipfile /pipeline/.
 RUN pipenv install --skip-lock --system && pipenv --clear
 
 COPY config /pipeline/config
-COPY --from=model_fetcher /model/cache/torch/hub/checkpoints/model_final.pth \
+COPY --from=model_fetcher /model/model_final.pth \
                           /pipeline/output/enhanced/model_final.pth
 COPY gen_metadata.py /pipeline
 

From 959321962d5d091134405132796d4c81ab86465e Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Thu, 27 Oct 2022 13:02:08 -0400
Subject: [PATCH 22/28] Make reading config files more robust

Looks for config files in config/ in the same directory as
gen_metadata.py. This will allow running gen_metadata.py from
any directory. This simplifies inclusion in a workflow that
utilizes relative paths.

Also adds gen_metadata.py to the PATH in the Dockerfile so
this script can be simply called.

Based on parts of https://github.com/hdr-bgnn/drexel_metadata/pull/6/commits/ea8b023e90006ffa6075d4410f2ecaa6f852beb6

Co-authored-by: thibaulttabarin <tabarin@battelleecology.org>
---
 Dockerfile      |  5 +++++
 gen_metadata.py | 14 ++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 gen_metadata.py

diff --git a/Dockerfile b/Dockerfile
index b1c86c0..04e64d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,6 +21,10 @@ RUN pip install --upgrade pip
 RUN pip install pipenv
 
 WORKDIR /pipeline
+
+# ADD scripts in /pipeline to the PATH
+ENV PATH="/pipeline:${PATH}"
+
 COPY Pipfile /pipeline/.
 
 # Install requirements
@@ -29,6 +33,7 @@ RUN pipenv install --skip-lock --system && pipenv --clear
 COPY config /pipeline/config
 COPY --from=model_fetcher /model/model_final.pth \
                           /pipeline/output/enhanced/model_final.pth
+
 COPY gen_metadata.py /pipeline
 
 CMD echo "python gen_metadata.py"
diff --git a/gen_metadata.py b/gen_metadata.py
old mode 100644
new mode 100755
index f3530b4..673b169
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import json
 import math
 import os
@@ -24,13 +25,18 @@
 
 # torch.multiprocessing.set_start_method('forkserver')
 
+# Look for the config directory in the same directory as this script
+root_dir_path = os.path.join(os.path.dirname(__file__))
+main_config_path = os.path.join(root_dir_path, 'config', 'config.json')
+mask_config_path = os.path.join(root_dir_path, 'config', 'mask_rcnn_R_50_FPN_3x.yaml')
+
 VAL_SCALE_FAC = 0.5
-conf = json.load(open('config/config.json', 'r'))
+conf = json.load(open(main_config_path, 'r'))
 ENHANCE = bool(conf['ENHANCE'])
 JOEL = bool(conf['JOEL'])
 IOU_PCT = .02
 
-with open('config/mask_rcnn_R_50_FPN_3x.yaml', 'r') as f:
+with open(mask_config_path, 'r') as f:
     iters = yaml.load(f, Loader=yaml.FullLoader)["SOLVER"]["MAX_ITER"]
 
 
@@ -42,12 +48,12 @@ def init_model(enhance_contrast=ENHANCE, joel=JOEL, device=None):
         predictor -- DefaultPredictor(**configs).
     """
     cfg = get_cfg()
-    cfg.merge_from_file("config/mask_rcnn_R_50_FPN_3x.yaml")
+    cfg.merge_from_file(mask_config_path)
     cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
     if not joel:
         cfg.OUTPUT_DIR += f"/non_enhanced" if not enhance_contrast else f"/enhanced"
         # cfg.OUTPUT_DIR += f"/non_enhanced_{iters}" if not enhance_contrast else f"/enhanced_{iters}"
-    cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
+    cfg.MODEL.WEIGHTS = os.path.join(os.path.join(root_dir_path, cfg.OUTPUT_DIR), "model_final.pth")
     cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
     if device:
        cfg.MODEL.DEVICE = device

From 8d60a240e9179664b41b3486b1e0fa1c881029cb Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Mon, 14 Nov 2022 08:15:02 -0500
Subject: [PATCH 23/28] Update README.md

Co-authored-by: Hilmar Lapp <hlapp@drycafe.net>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dbad82e..61f83f2 100644
--- a/README.md
+++ b/README.md
@@ -158,7 +158,7 @@ singularity run docker://ghcr.io/hdr-bgnn/drexel_metadata:<release> python gen_m
 
 [Joel Pepper et al.](https://ieeexplore.ieee.org/document/9651834)
 
-[Kevin Karnani et al.] (https://assets.researchsquare.com/files/rs-1506561/v1_covered.pdf?c=1651071974)
+Kevin Karnani, Joel Pepper, Yasin Bakis et al. Computational Metadata Generation Methods for Biological Specimen Image Collections, 27 April 2022, PREPRINT (Version 1) available at Research Square <https://doi.org/10.21203/rs.3.rs-1506561/v1>
 
 ## Authors
 

From 3f20105f12ba34ba720bbee3d959599d1311934e Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Mon, 14 Nov 2022 08:16:06 -0500
Subject: [PATCH 24/28] Update README.md

Co-authored-by: Hilmar Lapp <hlapp@drycafe.net>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 61f83f2..a13f949 100644
--- a/README.md
+++ b/README.md
@@ -156,7 +156,7 @@ singularity run docker://ghcr.io/hdr-bgnn/drexel_metadata:<release> python gen_m
 
 ## Associated Publication
 
-[Joel Pepper et al.](https://ieeexplore.ieee.org/document/9651834)
+J. Pepper, J. Greenberg, Y. Bakiş, X. Wang, H. Bart and D. Breen, "Automatic Metadata Generation for Fish Specimen Image Collections," 2021 ACM/IEEE Joint Conference on Digital Libraries (JCDL), 2021, pp. 31-40, doi: [10.1109/JCDL52503.2021.00015](https://doi.org/10.1109/JCDL52503.2021.00015).
 
 Kevin Karnani, Joel Pepper, Yasin Bakis et al. Computational Metadata Generation Methods for Biological Specimen Image Collections, 27 April 2022, PREPRINT (Version 1) available at Research Square <https://doi.org/10.21203/rs.3.rs-1506561/v1>
 

From 5ecc1b5213081205a428e4ebf6054c4ff9abfa07 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Mon, 14 Nov 2022 08:16:31 -0500
Subject: [PATCH 25/28] Update gen_metadata.py

Co-authored-by: Hilmar Lapp <hlapp@drycafe.net>
---
 gen_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gen_metadata.py b/gen_metadata.py
index 673b169..f250792 100755
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -939,7 +939,7 @@ def argument_parser():
     parser = argparse.ArgumentParser(description='Generate metadata for one or more fish images.')
     parser.add_argument('file_or_directory',
                         help='Path to a fish image or a directory of multiple fish images. '
-                             'When one file is passed the JSON metadata is printed to the terminal.')
+                             'When one file is passed the JSON metadata is printed to the terminal (except see --outfname).')
     parser.add_argument('limit', type=int, nargs='?',
                         help='Limit the number of images processed from a directory')
     parser.add_argument('--outfname',

From 7794de3f3426017a8d47047b0b105da14d346c4c Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Mon, 14 Nov 2022 08:16:50 -0500
Subject: [PATCH 26/28] Update gen_metadata.py

Co-authored-by: Hilmar Lapp <hlapp@drycafe.net>
---
 gen_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gen_metadata.py b/gen_metadata.py
index f250792..6f95460 100755
--- a/gen_metadata.py
+++ b/gen_metadata.py
@@ -943,7 +943,7 @@ def argument_parser():
     parser.add_argument('limit', type=int, nargs='?',
                         help='Limit the number of images processed from a directory')
     parser.add_argument('--outfname',
-                        help='Output filename to use for JSON metadata (disables printing to terminal).')
+                        help='Output filename to which to print JSON metadata (instead of terminal).')
     parser.add_argument('--device', choices=['cpu', 'cuda'], default=None,
                         help='Override the default device used for the ML model.')
     parser.add_argument('--maskfname',

From c12054d39721ba78771d236fb176d8370573cb75 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Mon, 14 Nov 2022 08:25:10 -0500
Subject: [PATCH 27/28] Update README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a13f949..ebcda88 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ These arguments are meant to simplify adding `gen_metadata.py` to a workflow tha
 A Docker container is automatically built for each **drexel_metadata** release. This container has the requirements installed and includes the model file.
 To run the singularity container for a specific version follow this pattern:
 ```
-singularity run docker://ghcr.io/hdr-bgnn/drexel_metadata:<release> python gen_metadata.py ...
+singularity run docker://ghcr.io/hdr-bgnn/drexel_metadata:<release> gen_metadata.py ...
 ```
 
 

From 7caa3c69366071c3edd7f139eb4ddef15c2ed819 Mon Sep 17 00:00:00 2001
From: John Bradley <johnbradley2008@gmail.com>
Date: Tue, 15 Nov 2022 10:21:16 -0500
Subject: [PATCH 28/28] Use stable dataverse-access container

Uses the command line stable dva container version.
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 04e64d3..abd3653 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ghcr.io/imageomics/dataverse-access:0.0.3 as model_fetcher
+FROM ghcr.io/imageomics/dataverse-access:1 as model_fetcher
 ARG DATAVERSE_API_TOKEN
 ENV DATAVERSE_URL=https://datacommons.tdai.osu.edu/
 ENV MODEL_DV_DOI=doi:10.5072/FK2/MMX6FY