Commit 4ffbf1c: cleanup and addition of fcis model code
rbavery committed May 29, 2020 (1 parent: 470d3e8)
Showing 46 changed files with 26,250 additions and 405 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
notebooks/dask-worker-space
app/LocalForwarder.config
notebooks/test_metric
images/*.tif
@@ -6,7 +7,9 @@ src/ # this is built from installing packages from github with conda
azure_configs.yaml
notebooks/test_metric
postgisdb/

fcis/InstanceSegmentation_Sentinel2/model/resnet_v1_101_coco_fcis_end2end_ohem/*.params
fcis/InstanceSegmentation_Sentinel2/model/resnet_v1_101_coco_fcis_end2end_ohem/*.json
fcis/InstanceSegmentation_Sentinel2/model/*.params
# Data files and directories common in repo root
data/
datasets/
26 changes: 26 additions & 0 deletions app/app-env.yml
@@ -0,0 +1,26 @@
name: ai4e_py_api
channels:
- conda-forge
dependencies:
# Core scientific python
- numpy
- matplotlib
- pyqt
- scipy
- scikit-image
- poppler<=0.61
# Mask_RCNN requirements
- Pillow
- cython
- matplotlib
- scikit-image
- tensorflow-gpu>=1.3.0
- keras-gpu>=2.0.8
- opencv
- h5py
- setuptools

- pip:
# Utilities
- imgaug # for mrcnn
- rasterio # for reading img, conda can't install without libpoppler issue
33 changes: 16 additions & 17 deletions base_config.yaml
@@ -29,9 +29,9 @@ INPUT:
TYPE: relative_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 0
MAX_SIZE_TRAIN: 512
MIN_SIZE_TEST: 0
MIN_SIZE_TRAIN:
- 640
- 672
@@ -86,9 +86,9 @@ MODEL:
- 1001.46930161
- 2793.30379383
PIXEL_STD:
- 262.82447442
- 340.61644907
- 559.11205354
- 1.0
- 1.0
- 1.0
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
@@ -161,10 +161,10 @@ MODEL:
- 0.7
ROI_BOX_HEAD:
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
- 0
- 0
- 0
- 0
CLS_AGNOSTIC_BBOX_REG: false
CONV_DIM: 256
FC_DIM: 1024
@@ -266,19 +266,18 @@ MODEL:
NORM: GN
NUM_CLASSES: 54
WEIGHTS: https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
OUTPUT_DIR: /datadrive/cropmask_experiments/nirrg-double-roi_heads/
OUTPUT_DIR: /datadrive/cropmask_experiments/nirrg-nms7/
SEED: -1
SOLVER:
BASE_LR: 0.0003
BASE_LR: 0.0005
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 800
CHECKPOINT_PERIOD: 200
GAMMA: 0.1
IMS_PER_BATCH: 16
IMS_PER_BATCH: 8
LR_SCHEDULER_NAME: WarmupMultiStepLR
MAX_ITER: 4000
MAX_ITER: 3000
MOMENTUM: 0.9
STEPS:
- 650
STEPS: []
WARMUP_FACTOR: 0.001
WARMUP_ITERS: 1000
WARMUP_METHOD: linear
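This file follows detectron2's YAML config schema; a minimal sketch of loading it (assumed usage, not part of this commit):

    # load base_config.yaml into a detectron2 config object
    from detectron2.config import get_cfg

    cfg = get_cfg()
    cfg.merge_from_file("base_config.yaml")
    print(cfg.SOLVER.BASE_LR)  # 0.0005 after this change
    print(cfg.OUTPUT_DIR)      # /datadrive/cropmask_experiments/nirrg-nms7/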
6 changes: 0 additions & 6 deletions bash-scripts/push_container.sh

This file was deleted.

2 changes: 0 additions & 2 deletions bash-scripts/shp_to_geojson.sh

This file was deleted.

117 changes: 92 additions & 25 deletions cropmask/coco_convert.py
@@ -8,10 +8,9 @@
import fnmatch
from PIL import Image
from pycococreatortools import pycococreatortools
import cropmask.misc as misc
from shutil import copyfile
import pandas as pd
import numpy as np  # used by np.unique in split_save_train_validation_test_df
import solaris as sol
from solaris.data.coco import geojson2coco
from sklearn.model_selection import train_test_split
from detectron2.data.datasets import register_coco_instances, load_coco_json

@@ -41,8 +40,56 @@ def create_coco_meta():
},
]
return INFO_DICT, LICENSE_DICT, PRESET_CATEGORIES

def split_save_train_validation_test_df(tiles_path, validation_size = .1, test_size = .1, random_state = 1, save_empty_tiles = False):

def get_tb_substring(fname):
    # extract the tile-bounds suffix, e.g. "-123_456" from "..._-123_456.tif"
    fname = str(fname)
    return re.search(r'.+_(-*\d+_\d+)\..+', fname).group(1)

def get_date_substring(fname):
    # extract the date pair preceding the "_C" collection segment, e.g. "20180403_20180417"
    fname = str(fname)
    return re.search(r'.+_(\d+_\d+)_C.+', fname).group(1)

def match_by_tb_date(jpeg_tiles, image_tiles, label_tiles, geojson_tiles):
    """
    Matches jpeg, image, label, and geojson tiles that share the same tile-bounds
    and date substrings.

    A faster approach that doesn't work for some reason; ids look unique, but the lists aren't sorted the same:
    def get_tb_substring2(fname):
        fname = str(fname)
        return re.search(r'.+_-*(\d+_\d+)\..+', fname).group(1)
    def get_date_substring2(fname):
        fname = str(fname)
        return re.search(r'.+_(\d+_\d+)_.+', fname).group(1)
    def get_uid(x):
        tb1, tb2 = get_tb_substring2(x).split("_")
        d1, d2 = get_date_substring2(x).split("_")
        return int(tb1 + tb2 + d1 + d2)
    sg = sorted(geojson_lst, key=lambda x: get_uid(x))  # sort doesn't work for some reason
    sl = sorted(label_lst, key=lambda x: get_uid(x))
    """
    print(f"total list length: {len(label_tiles)}")
    match_lst = []
    count = 0
    for i in label_tiles:
        tb = get_tb_substring(i)
        date = get_date_substring(i)
        # assumes a matching tile always exists; the loop variables keep the last match on break
        for j in geojson_tiles:
            if tb in j and date in j:
                break
        for k in image_tiles:
            if tb in k and date in k:
                break
        for z in jpeg_tiles:
            if tb in z and date in z:
                break
        match_lst.append([z, k, i, j])
        count += 1
        if count % 1000 == 0:
            print(count)
    return match_lst
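# Hedged illustration with a hypothetical filename that follows the matching_re
# pattern used in create_coco_dataset below:
#   fname = "029032_20180403_20180417_C01_V1_-123_456.tif"
#   get_tb_substring(fname)    # -> "-123_456" (tile bounds)
#   get_date_substring(fname)  # -> "20180403_20180417" (date pair)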

def make_train_validation_test_df(tiles_path, save_empty_tiles = False):
    """
    Takes a Path to tiles and matches the jpeg, image, label, and geojson tiles
    by tile bounds and date, returning a single DataFrame of matched paths.
@@ -57,33 +104,50 @@ def split_save_train_validation_test_df(tiles_path, validation_size = .1, test_s
    jpeg_tiles_path = tiles_path / "jpeg_tiles"
    label_tiles_path = tiles_path / "label_tiles"
    geojson_tiles_path = tiles_path / "geojson_tiles"
    label_tiles = list(label_tiles_path.glob("*"))
    image_tiles = list(image_tiles_path.glob("*"))
    geojson_tiles = list(geojson_tiles_path.glob("*"))
    jpeg_tiles = list(jpeg_tiles_path.glob("*"))
    label_tiles = list(label_tiles_path.glob("*.tif"))
    image_tiles = list(image_tiles_path.glob("*.tif"))
    geojson_tiles = list(geojson_tiles_path.glob("*.geojson"))
    jpeg_tiles = list(jpeg_tiles_path.glob("*.jpg"))
    assert len(label_tiles) > 0
    assert len(jpeg_tiles) > 0
    assert len(image_tiles) > 0
    # build tuples of label and im paths
    sorted_image_tiles = sorted(image_tiles, key=lambda x: str(x)[-19:])
    sorted_jpeg_tiles = sorted(jpeg_tiles, key=lambda x: str(x)[-19:])
    sorted_label_tiles = sorted(label_tiles, key=lambda x: str(x)[-19:])
    sorted_geojson_tiles = sorted(geojson_tiles, key=lambda x: str(x)[-23:])
    sorted_image_tiles = [str(i) for i in sorted(image_tiles, key=lambda x: str(x)[-19:])]
    sorted_jpeg_tiles = [str(i) for i in sorted(jpeg_tiles, key=lambda x: str(x)[-19:])]
    sorted_label_tiles = [str(i) for i in sorted(label_tiles, key=lambda x: str(x)[-19:])]
    sorted_geojson_tiles = [str(i) for i in sorted(geojson_tiles, key=lambda x: str(x)[-23:])]

    all_tiles_df = pd.DataFrame(list(zip(sorted_jpeg_tiles, sorted_image_tiles, sorted_label_tiles, sorted_geojson_tiles)), columns = ["jpeg_tiles", "image_tiles", "label_tiles", "geojson_tiles"])
    print("sort keys, these should follow same format and match")
    print(str(sorted_image_tiles[-1])[-19:])
    print(str(sorted_jpeg_tiles[-1])[-19:])
    print(str(sorted_label_tiles[-1])[-19:])
    print(str(sorted_geojson_tiles[-1])[-23:])
    match_lst = match_by_tb_date(sorted_jpeg_tiles, sorted_image_tiles, sorted_label_tiles, sorted_geojson_tiles)
    all_tiles_df = pd.DataFrame(match_lst, columns = ["jpeg_tiles", "image_tiles", "label_tiles", "geojson_tiles"])

    all_tiles_df['is_empty'] = all_tiles_df.loc[:,'label_tiles'].apply(str).str.contains("empty", regex=False)

    if save_empty_tiles is not True:
        all_tiles_df = all_tiles_df[all_tiles_df['is_empty']==False]
    all_tiles_df['tile_bounds'] = all_tiles_df['label_tiles'].apply(lambda x: get_tb_substring(x))
    all_tiles_df['date'] = all_tiles_df['label_tiles'].apply(lambda x: get_date_substring(x))

    not_test, test = train_test_split(all_tiles_df, test_size=test_size, random_state = 1)
    train, validation = train_test_split(not_test, test_size=validation_size, random_state = 1)
    return all_tiles_df

    train.to_csv(tiles_path.parent / "coco" / "train.csv")
    validation.to_csv(tiles_path.parent / "coco" / "validation.csv")
    test.to_csv(tiles_path.parent / "coco" / "test.csv")
    return train, validation, test

def split_save_train_validation_test_df(all_tiles_df, tiles_path, validation_size = .15, test_size = .05):
    # split on unique tile geographies so all dates for a given tile land in the same split
    not_test, test = train_test_split(np.unique(all_tiles_df['tile_bounds']), test_size=test_size, random_state=1)
    testdf = all_tiles_df[all_tiles_df.tile_bounds.isin(test)]
    train, validation = train_test_split(not_test, test_size=validation_size, random_state = 1)
    traindf = all_tiles_df[all_tiles_df.tile_bounds.isin(train)]
    vdf = all_tiles_df[all_tiles_df.tile_bounds.isin(validation)]
    traindf.to_csv(tiles_path.parent / "coco" / "train.csv")
    vdf.to_csv(tiles_path.parent / "coco" / "validation.csv")
    testdf.to_csv(tiles_path.parent / "coco" / "test.csv")
    return traindf, vdf, testdf


def create_coco_dataset(df):
"""
@@ -93,7 +157,7 @@ def create_coco_dataset(df):
    geojson_lst = df['geojson_tiles'].to_list()
    info, license, preset_categories = create_coco_meta() # preset cats unused for now, unsure how to properly work this with detectron
    # crazy regex is based on appending scene ID to each tile, including path/row and date info
    coco_dict = sol.data.coco.geojson2coco(image_src = [str(i) for i in img_lst],
    coco_dict = geojson2coco(image_src = [str(i) for i in img_lst],
                             label_src = [str(i) for i in geojson_lst],
                             matching_re=r'(\d{6}_\d{8}_\d{8}_C\d{2}_V\d_-?\d+_\d+)',
                             remove_all_multipolygons = True,
@@ -118,13 +182,16 @@ def dataset_to_coco(dataset_path, img_type, experiment_dir=False):
"""
Experiment dir only required for detectron2 dataset workflow.
For denmark dataset workflow, experiment dir is created seperately since it has it's own folder structure. And requires jpeg instead of tif.
Randomly splits the dataset into train, validation, test by unique tile geographies.
"""
tiles_path = Path(dataset_path) / "tiles"
train, validation, test = split_save_train_validation_test_df(tiles_path, save_empty_tiles=False)
tile_df = make_train_validation_test_df(tiles_path = tiles_path, save_empty_tiles = False)
train, validation, test = split_save_train_validation_test_df(tile_df)
    coco_path = Path(dataset_path) / "coco"
    train_coco_instances_path = str(coco_path / "instances_train.json")
    val_coco_instances_path = str(coco_path / "instances_val.json")
    test_coco_instances_path = str(coco_path / "instances_test.json")
    train_coco_instances_path = str(coco_path / "det_instances_train.json")
    val_coco_instances_path = str(coco_path / "det_instances_val.json")
    test_coco_instances_path = str(coco_path / "det_instances_test.json")
    if (coco_path / "det_instances_train.json").exists() is False:  # match the det_ filenames assigned above
        train_coco_dict = create_coco_dataset(train)
        val_coco_dict = create_coco_dataset(validation)
@@ -151,4 +218,4 @@ def dataset_to_coco(dataset_path, img_type, experiment_dir=False):
        os.makedirs(experiment_dir, exist_ok=False)
    except:
        pass
    return train_coco_instances_path, val_coco_instances_path, test_coco_instances_path
    return (train_coco_instances_path, train), (val_coco_instances_path, validation), (test_coco_instances_path, test)
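A hedged sketch of consuming the returned (path, DataFrame) pairs with detectron2 (the dataset names and paths below are hypothetical):

    from detectron2.data.datasets import register_coco_instances

    (train_json, train_df), (val_json, val_df), (test_json, test_df) = dataset_to_coco(
        "/datadrive/dataset", img_type="jpeg")
    register_coco_instances("cropmask_train", {}, train_json, "/datadrive/dataset/tiles/jpeg_tiles")
    register_coco_instances("cropmask_val", {}, val_json, "/datadrive/dataset/tiles/jpeg_tiles")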