Commit 4ffbf1c: cleanup and addition of fcis model code
rbavery committed May 29, 2020 (1 parent: 470d3e8)
Showing 46 changed files with 26,250 additions and 405 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
notebooks/dask-worker-space
app/LocalForwarder.config
notebooks/test_metric
images/*.tif
@@ -6,7 +7,9 @@ src/ # this is built from installing packages from github with conda
azure_configs.yaml
notebooks/test_metric
postgisdb/

fcis/InstanceSegmentation_Sentinel2/model/resnet_v1_101_coco_fcis_end2end_ohem/*.params
fcis/InstanceSegmentation_Sentinel2/model/resnet_v1_101_coco_fcis_end2end_ohem/*.json
fcis/InstanceSegmentation_Sentinel2/model/*.params
# Data files and directories common in repo root
data/
datasets/
26 changes: 26 additions & 0 deletions app/app-env.yml
@@ -0,0 +1,26 @@
name: ai4e_py_api
channels:
- conda-forge
dependencies:
# Core scientific python
- numpy
- matplotlib
- pyqt
- scipy
- scikit-image
- poppler<=0.61
# Mask_RCNN requirements
- Pillow
- cython
- matplotlib
- scikit-image
- tensorflow-gpu>=1.3.0
- keras-gpu>=2.0.8
- opencv
- h5py
- setuptools

- pip:
# Utilities
- imgaug # for mrcnn
- rasterio # for reading img, conda can't install without libpoppler issue
33 changes: 16 additions & 17 deletions base_config.yaml
@@ -29,9 +29,9 @@ INPUT:
TYPE: relative_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 0
MAX_SIZE_TRAIN: 512
MIN_SIZE_TEST: 0
MIN_SIZE_TRAIN:
- 640
- 672
@@ -86,9 +86,9 @@ MODEL:
- 1001.46930161
- 2793.30379383
PIXEL_STD:
- 262.82447442
- 340.61644907
- 559.11205354
- 1.0
- 1.0
- 1.0
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
@@ -161,10 +161,10 @@ MODEL:
- 0.7
ROI_BOX_HEAD:
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
- 0
- 0
- 0
- 0
CLS_AGNOSTIC_BBOX_REG: false
CONV_DIM: 256
FC_DIM: 1024
@@ -266,19 +266,18 @@ MODEL:
NORM: GN
NUM_CLASSES: 54
WEIGHTS: https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
OUTPUT_DIR: /datadrive/cropmask_experiments/nirrg-double-roi_heads/
OUTPUT_DIR: /datadrive/cropmask_experiments/nirrg-nms7/
SEED: -1
SOLVER:
BASE_LR: 0.0003
BASE_LR: 0.0005
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 800
CHECKPOINT_PERIOD: 200
GAMMA: 0.1
IMS_PER_BATCH: 16
IMS_PER_BATCH: 8
LR_SCHEDULER_NAME: WarmupMultiStepLR
MAX_ITER: 4000
MAX_ITER: 3000
MOMENTUM: 0.9
STEPS:
- 650
STEPS: []
WARMUP_FACTOR: 0.001
WARMUP_ITERS: 1000
WARMUP_METHOD: linear
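This file follows detectron2's YAML config schema; a minimal sketch of loading it (assumed usage, not part of this commit):

    # load base_config.yaml into a detectron2 config object
    from detectron2.config import get_cfg

    cfg = get_cfg()
    cfg.merge_from_file("base_config.yaml")
    print(cfg.SOLVER.BASE_LR)  # 0.0005 after this change
    print(cfg.OUTPUT_DIR)      # /datadrive/cropmask_experiments/nirrg-nms7/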
6 changes: 0 additions & 6 deletions bash-scripts/push_container.sh

This file was deleted.

2 changes: 0 additions & 2 deletions bash-scripts/shp_to_geojson.sh

This file was deleted.

117 changes: 92 additions & 25 deletions cropmask/coco_convert.py
@@ -8,10 +8,9 @@
import fnmatch
from PIL import Image
from pycococreatortools import pycococreatortools
import cropmask.misc as misc
from shutil import copyfile
import pandas as pd
import numpy as np  # used by np.unique in split_save_train_validation_test_df
import solaris as sol
from solaris.data.coco import geojson2coco
from sklearn.model_selection import train_test_split
from detectron2.data.datasets import register_coco_instances, load_coco_json

@@ -41,8 +40,56 @@ def create_coco_meta():
},
]
return INFO_DICT, LICENSE_DICT, PRESET_CATEGORIES

def split_save_train_validation_test_df(tiles_path, validation_size = .1, test_size = .1, random_state = 1, save_empty_tiles = False):

def get_tb_substring(fname):
    # extract the tile-bounds suffix, e.g. "-123_456" from "..._-123_456.tif"
    fname = str(fname)
    return re.search(r'.+_(-*\d+_\d+)\..+', fname).group(1)

def get_date_substring(fname):
    # extract the date pair preceding the "_C" collection segment, e.g. "20180403_20180417"
    fname = str(fname)
    return re.search(r'.+_(\d+_\d+)_C.+', fname).group(1)

def match_by_tb_date(jpeg_tiles, image_tiles, label_tiles, geojson_tiles):
    """
    Matches jpeg, image, label, and geojson tiles that share the same tile-bounds
    and date substrings.

    A faster approach that doesn't work for some reason; ids look unique, but the lists aren't sorted the same:
    def get_tb_substring2(fname):
        fname = str(fname)
        return re.search(r'.+_-*(\d+_\d+)\..+', fname).group(1)
    def get_date_substring2(fname):
        fname = str(fname)
        return re.search(r'.+_(\d+_\d+)_.+', fname).group(1)
    def get_uid(x):
        tb1, tb2 = get_tb_substring2(x).split("_")
        d1, d2 = get_date_substring2(x).split("_")
        return int(tb1 + tb2 + d1 + d2)
    sg = sorted(geojson_lst, key=lambda x: get_uid(x))  # sort doesn't work for some reason
    sl = sorted(label_lst, key=lambda x: get_uid(x))
    """
    print(f"total list length: {len(label_tiles)}")
    match_lst = []
    count = 0
    for i in label_tiles:
        tb = get_tb_substring(i)
        date = get_date_substring(i)
        # assumes a matching tile always exists; the loop variables keep the last match on break
        for j in geojson_tiles:
            if tb in j and date in j:
                break
        for k in image_tiles:
            if tb in k and date in k:
                break
        for z in jpeg_tiles:
            if tb in z and date in z:
                break
        match_lst.append([z, k, i, j])
        count += 1
        if count % 1000 == 0:
            print(count)
    return match_lst
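# Hedged illustration with a hypothetical filename that follows the matching_re
# pattern used in create_coco_dataset below:
#   fname = "029032_20180403_20180417_C01_V1_-123_456.tif"
#   get_tb_substring(fname)    # -> "-123_456" (tile bounds)
#   get_date_substring(fname)  # -> "20180403_20180417" (date pair)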

def make_train_validation_test_df(tiles_path, save_empty_tiles = False):
    """
    Takes a Path to tiles and matches the jpeg, image, label, and geojson tiles
    by tile bounds and date, returning a single DataFrame of matched paths.
@@ -57,33 +104,50 @@ def split_save_train_validation_test_df(tiles_path, validation_size = .1, test_s
    jpeg_tiles_path = tiles_path / "jpeg_tiles"
    label_tiles_path = tiles_path / "label_tiles"
    geojson_tiles_path = tiles_path / "geojson_tiles"
    label_tiles = list(label_tiles_path.glob("*"))
    image_tiles = list(image_tiles_path.glob("*"))
    geojson_tiles = list(geojson_tiles_path.glob("*"))
    jpeg_tiles = list(jpeg_tiles_path.glob("*"))
    label_tiles = list(label_tiles_path.glob("*.tif"))
    image_tiles = list(image_tiles_path.glob("*.tif"))
    geojson_tiles = list(geojson_tiles_path.glob("*.geojson"))
    jpeg_tiles = list(jpeg_tiles_path.glob("*.jpg"))
    assert len(label_tiles) > 0
    assert len(jpeg_tiles) > 0
    assert len(image_tiles) > 0
    # build tuples of label and im paths
    sorted_image_tiles = sorted(image_tiles, key=lambda x: str(x)[-19:])
    sorted_jpeg_tiles = sorted(jpeg_tiles, key=lambda x: str(x)[-19:])
    sorted_label_tiles = sorted(label_tiles, key=lambda x: str(x)[-19:])
    sorted_geojson_tiles = sorted(geojson_tiles, key=lambda x: str(x)[-23:])
    sorted_image_tiles = [str(i) for i in sorted(image_tiles, key=lambda x: str(x)[-19:])]
    sorted_jpeg_tiles = [str(i) for i in sorted(jpeg_tiles, key=lambda x: str(x)[-19:])]
    sorted_label_tiles = [str(i) for i in sorted(label_tiles, key=lambda x: str(x)[-19:])]
    sorted_geojson_tiles = [str(i) for i in sorted(geojson_tiles, key=lambda x: str(x)[-23:])]

    all_tiles_df = pd.DataFrame(list(zip(sorted_jpeg_tiles, sorted_image_tiles, sorted_label_tiles, sorted_geojson_tiles)), columns = ["jpeg_tiles", "image_tiles", "label_tiles", "geojson_tiles"])
    print("sort keys, these should follow same format and match")
    print(str(sorted_image_tiles[-1])[-19:])
    print(str(sorted_jpeg_tiles[-1])[-19:])
    print(str(sorted_label_tiles[-1])[-19:])
    print(str(sorted_geojson_tiles[-1])[-23:])
    match_lst = match_by_tb_date(sorted_jpeg_tiles, sorted_image_tiles, sorted_label_tiles, sorted_geojson_tiles)
    all_tiles_df = pd.DataFrame(match_lst, columns = ["jpeg_tiles", "image_tiles", "label_tiles", "geojson_tiles"])

    all_tiles_df['is_empty'] = all_tiles_df.loc[:,'label_tiles'].apply(str).str.contains("empty", regex=False)

    if save_empty_tiles is not True:
        all_tiles_df = all_tiles_df[all_tiles_df['is_empty']==False]
    all_tiles_df['tile_bounds'] = all_tiles_df['label_tiles'].apply(lambda x: get_tb_substring(x))
    all_tiles_df['date'] = all_tiles_df['label_tiles'].apply(lambda x: get_date_substring(x))

    not_test, test = train_test_split(all_tiles_df, test_size=test_size, random_state = 1)
    train, validation = train_test_split(not_test, test_size=validation_size, random_state = 1)
    return all_tiles_df

    train.to_csv(tiles_path.parent / "coco" / "train.csv")
    validation.to_csv(tiles_path.parent / "coco" / "validation.csv")
    test.to_csv(tiles_path.parent / "coco" / "test.csv")
    return train, validation, test

def split_save_train_validation_test_df(all_tiles_df, tiles_path, validation_size = .15, test_size = .05):
    # split on unique tile geographies so all dates for a given tile land in the same split
    not_test, test = train_test_split(np.unique(all_tiles_df['tile_bounds']), test_size=test_size, random_state=1)
    testdf = all_tiles_df[all_tiles_df.tile_bounds.isin(test)]
    train, validation = train_test_split(not_test, test_size=validation_size, random_state = 1)
    traindf = all_tiles_df[all_tiles_df.tile_bounds.isin(train)]
    vdf = all_tiles_df[all_tiles_df.tile_bounds.isin(validation)]
    traindf.to_csv(tiles_path.parent / "coco" / "train.csv")
    vdf.to_csv(tiles_path.parent / "coco" / "validation.csv")
    testdf.to_csv(tiles_path.parent / "coco" / "test.csv")
    return traindf, vdf, testdf


def create_coco_dataset(df):
"""
@@ -93,7 +157,7 @@ def create_coco_dataset(df):
    geojson_lst = df['geojson_tiles'].to_list()
    info, license, preset_categories = create_coco_meta() # preset cats unused for now, unsure how to properly work this with detectron
    # crazy regex is based on appending scene ID to each tile, including path/row and date info
    coco_dict = sol.data.coco.geojson2coco(image_src = [str(i) for i in img_lst],
    coco_dict = geojson2coco(image_src = [str(i) for i in img_lst],
                             label_src = [str(i) for i in geojson_lst],
                             matching_re=r'(\d{6}_\d{8}_\d{8}_C\d{2}_V\d_-?\d+_\d+)',
                             remove_all_multipolygons = True,
@@ -118,13 +182,16 @@ def dataset_to_coco(dataset_path, img_type, experiment_dir=False):
"""
Experiment dir only required for detectron2 dataset workflow.
For denmark dataset workflow, experiment dir is created seperately since it has it's own folder structure. And requires jpeg instead of tif.
Randomly splits the dataset into train, validation, test by unique tile geographies.
"""
tiles_path = Path(dataset_path) / "tiles"
train, validation, test = split_save_train_validation_test_df(tiles_path, save_empty_tiles=False)
tile_df = make_train_validation_test_df(tiles_path = tiles_path, save_empty_tiles = False)
train, validation, test = split_save_train_validation_test_df(tile_df)
    coco_path = Path(dataset_path) / "coco"
    train_coco_instances_path = str(coco_path / "instances_train.json")
    val_coco_instances_path = str(coco_path / "instances_val.json")
    test_coco_instances_path = str(coco_path / "instances_test.json")
    train_coco_instances_path = str(coco_path / "det_instances_train.json")
    val_coco_instances_path = str(coco_path / "det_instances_val.json")
    test_coco_instances_path = str(coco_path / "det_instances_test.json")
    if (coco_path / "det_instances_train.json").exists() is False:  # match the det_ filenames assigned above
        train_coco_dict = create_coco_dataset(train)
        val_coco_dict = create_coco_dataset(validation)
@@ -151,4 +218,4 @@ def dataset_to_coco(dataset_path, img_type, experiment_dir=False):
        os.makedirs(experiment_dir, exist_ok=False)
    except:
        pass
    return train_coco_instances_path, val_coco_instances_path, test_coco_instances_path
    return (train_coco_instances_path, train), (val_coco_instances_path, validation), (test_coco_instances_path, test)
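A hedged sketch of consuming the returned (path, DataFrame) pairs with detectron2 (the dataset names and paths below are hypothetical):

    from detectron2.data.datasets import register_coco_instances

    (train_json, train_df), (val_json, val_df), (test_json, test_df) = dataset_to_coco(
        "/datadrive/dataset", img_type="jpeg")
    register_coco_instances("cropmask_train", {}, train_json, "/datadrive/dataset/tiles/jpeg_tiles")
    register_coco_instances("cropmask_val", {}, val_json, "/datadrive/dataset/tiles/jpeg_tiles")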