diff --git a/enas/cifar10/block_stacking_reader.py b/enas/cifar10/block_stacking_reader.py new file mode 100644 index 0000000..738ba71 --- /dev/null +++ b/enas/cifar10/block_stacking_reader.py @@ -0,0 +1,815 @@ + +import h5py +import os +import io +import sys +import glob +import traceback +from PIL import Image +from skimage.transform import resize + +import numpy as np +from numpy.random import RandomState +import json +import keras +from keras.utils import Sequence +from keras.utils import OrderedEnqueuer +import tensorflow as tf +import grasp_metrics +import keras_applications +import keras_preprocessing + + +def random_eraser(input_img, p=0.5, s_l=0.02, s_h=0.4, r_1=0.3, r_2=1/0.3, v_l=0, v_h=255, pixel_level=True): + """ Cutout and random erasing algorithms for data augmentation + + source: + https://github.com/yu4u/cutout-random-erasing/blob/master/random_eraser.py + """ + img_h, img_w, img_c = input_img.shape + p_1 = np.random.rand() + + if p_1 > p: + return input_img + + while True: + s = np.random.uniform(s_l, s_h) * img_h * img_w + r = np.random.uniform(r_1, r_2) + w = int(np.sqrt(s / r)) + h = int(np.sqrt(s * r)) + left = np.random.randint(0, img_w) + top = np.random.randint(0, img_h) + + if left + w <= img_w and top + h <= img_h: + break + + if pixel_level: + c = np.random.uniform(v_l, v_h, (h, w, img_c)) + else: + c = np.random.uniform(v_l, v_h) + + input_img[top:top + h, left:left + w, :] = c + + return input_img + + +def tile_vector_as_image_channels_np(vector_op, image_shape): + """ + Takes a vector of length n and an image shape BHWC, + and repeat the vector as channels at each pixel. + + # Params + + vector_op: A tensor vector to tile. + image_shape: A list of integers [width, height] with the desired dimensions. + """ + # input vector shape + ivs = np.shape(vector_op) + # reshape the vector into a single pixel + vector_pixel_shape = [ivs[0], 1, 1, ivs[1]] + vector_op = np.reshape(vector_op, vector_pixel_shape) + # tile the pixel into a full image + tile_dimensions = [1, image_shape[1], image_shape[2], 1] + vector_op = np.tile(vector_op, tile_dimensions) + # if K.backend() is 'tensorflow': + # output_shape = [ivs[0], image_shape[1], image_shape[2], ivs[1]] + # vector_op.set_shape(output_shape) + return vector_op + + +def concat_images_with_tiled_vector_np(images, vector): + """Combine a set of images with a vector, tiling the vector at each pixel in the images and concatenating on the channel axis. + + # Params + + images: list of images with the same dimensions + vector: vector to tile on each image. If you have + more than one vector, simply concatenate them + all before calling this function. + + # Returns + + """ + if not isinstance(images, list): + images = [images] + image_shape = np.shape(images[0]) + tiled_vector = tile_vector_as_image_channels_np(vector, image_shape) + images.append(tiled_vector) + combined = np.concatenate(images, axis=-1) + + return combined + + +def concat_unit_meshgrid_np(tensor): + """ Concat unit meshgrid onto the tensor. + + This is roughly equivalent to the input in uber's coordconv. + TODO(ahundt) concat_unit_meshgrid_np is untested. 
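+
+    # Example
+
+    A rough shape sketch (untested, the batch size of 4 is arbitrary):
+
+        batch = np.zeros((4, 224, 224, 3))
+        with_grid = concat_unit_meshgrid_np(batch)
+        # with_grid.shape == (4, 224, 224, 5); the two extra channels
+        # hold the normalized y and x pixel coordinates.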
+ """ + assert len(tensor.shape) == 4 + # print('tensor shape: ' + str(tensor.shape)) + y_size = tensor.shape[1] + x_size = tensor.shape[2] + max_value = max(x_size, y_size) + y, x = np.meshgrid(np.arange(y_size), + np.arange(x_size), + indexing='ij') + assert y.size == x.size and y.size == tensor.shape[1] * tensor.shape[2] + # print('x shape: ' + str(x.shape) + ' y shape: ' + str(y.shape)) + # rescale data and reshape to have the same dimension as the tensor + y = np.reshape(y / max_value, [1, y.shape[0], y.shape[1], 1]) + x = np.reshape(x / max_value, [1, x.shape[0], x.shape[1], 1]) + + # need to have a meshgrid for each example in the batch, + # so tile along batch axis + tile_dimensions = [tensor.shape[0], 1, 1, 1] + y = np.tile(y, tile_dimensions) + x = np.tile(x, tile_dimensions) + combined = np.concatenate([tensor, y, x], axis=-1) + return combined + + +def blend_images_np(image, image2, alpha=0.5): + """Draws image2 on an image. + Args: + image: uint8 numpy array with shape (img_height, img_height, 3) + image2: a uint8 numpy array of shape (img_height, img_height) with + values between either 0 or 1. + color: color to draw the keypoints with. Default is red. + alpha: transparency value between 0 and 1. (default: 0.4) + Raises: + ValueError: On incorrect data type for image or image2s. + """ + if image.dtype != np.uint8: + raise ValueError('`image` not of type np.uint8') + if image2.dtype != np.uint8: + raise ValueError('`image2` not of type np.uint8') + if image.shape[:2] != image2.shape[:2]: + raise ValueError('The image has spatial dimensions %s but the image2 has ' + 'dimensions %s' % (image.shape[:2], image2.shape[:2])) + pil_image = Image.fromarray(image) + pil_image2 = Image.fromarray(image2) + + pil_image = Image.blend(pil_image, pil_image2, alpha) + np.copyto(image, np.array(pil_image.convert('RGB'))) + return image + + +def blend_image_sequence(images, alpha=0.5, verbose=0): + """ Blend past goal images + """ + blended_image = images[0] + if len(images) > 1: + for image in images[1:]: + if verbose > 1: + print('image type: ' + str(type(image)) + ' dtype: ' + str(image.dtype)) + blended_image = blend_images_np(blended_image, image) + return blended_image + + +def get_past_goal_indices(current_robot_time_index, goal_indices, filename='', verbose=0): + """ get past goal image indices, including the initial image + + # Arguments + + current_robot_time_index: the index of the current "robot time" being simulated + goal_indices: a list of goal time indices for every robot time + + # Returns + + A list of indices representing all the goal time steps + """ + image_indices = [0] + total_goal_indices = len(goal_indices) + if verbose: + print('total images: ' + str(total_goal_indices)) + image_index = 1 + while image_index < current_robot_time_index and image_index < total_goal_indices: + if verbose > 0: + print('image_index: ' + str(image_index)) + goal_image_index = goal_indices[image_index] + if goal_image_index < current_robot_time_index and goal_image_index < total_goal_indices: + if verbose > 0: + print('goal_indices[goal_image_index]: ' + str(goal_indices[goal_image_index])) + image_indices += [goal_image_index] + if goal_image_index <= goal_indices[goal_image_index]: + image_index += 1 + # TODO(ahundt) understand the cause of the warning below, modify the preprocessing script to correct it + elif goal_image_index >= total_goal_indices and verbose > 0: + print('block_stacking_reader.py::get_past_goal_indices(): warning, goal index equals ' + 'or exceeds total_goal_indices. 
filename: ' + str(filename) + + ' goal_image_index: ' + str(goal_image_index) + + ' total_goal_indices: ' + str(total_goal_indices)) + image_index = goal_image_index + return image_indices + + +def encode_label(label_features_to_extract, y, action_successes=None, random_augmentation=None, current_stacking_reward=None): + """ Encode a label based on the features that need to be extracted from the pose y. + + y: list of poses in [[x, y, z, qx, qy, qz, qw]] format + action_successes: list of labels with successful actions + """ + # determine the label + if label_features_to_extract is None or 'grasp_goal_xyz_3' in label_features_to_extract: + # regression to translation case, see semantic_translation_regression in cornell_grasp_train.py + y = grasp_metrics.batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc(y, random_augmentation=random_augmentation) + y = y[:, :3] + elif label_features_to_extract is None or 'grasp_goal_aaxyz_nsc_5' in label_features_to_extract: + # regression to rotation case, see semantic_rotation_regression in cornell_grasp_train.py + y = grasp_metrics.batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc(y, random_augmentation=random_augmentation) + y = y[:, 3:] + elif label_features_to_extract is None or 'grasp_goal_xyz_aaxyz_nsc_8' in label_features_to_extract: + # default, regression label case + y = grasp_metrics.batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc(y, random_augmentation=random_augmentation) + elif 'grasp_success' in label_features_to_extract or 'action_success' in label_features_to_extract: + if action_successes is None: + raise ValueError( + 'encode_label() was not provided with action_successes, ' + 'which should contain data about the future outcome of the action.') + # classification label case + y = action_successes + elif 'stacking_reward' in label_features_to_extract: + y = current_stacking_reward + else: + raise ValueError('Unsupported label_features_to_extract: ' + str(label_features_to_extract)) + return y + + +def encode_action_and_images( + data_features_to_extract, + poses, + action_labels, + init_images, + current_images, + y=None, + random_augmentation=None, + encoded_goal_pose=None, + epsilon=1e-3): + """ Given an action and images, return the combined input object performing prediction with keras. + + data_features_to_extract: A string identifier for the encoding to use for the actions and images. + Options include: 'image_0_image_n_vec_xyz_aaxyz_nsc_15', 'image_0_image_n_vec_xyz_10', + 'current_xyz_aaxyz_nsc_8', 'current_xyz_3', 'proposed_goal_xyz_aaxyz_nsc_8'. + action_labels: batch of already one-hot or floating point encoded action label + init_images: batch of clear view images, the initial in the time series. + These should already be the appropriate size and rgb values in the range [0, 255]. + current_images: batch of current image in the time series. + These should already be the appropriate size and rgb values in the range [0, 255]. + y: labels, particularly useful when classifying the quality of a regressed action. + random_augmentation: None has no effect, if given a float from 0 to 1 + it will modify the poses with a small amount of translation and rotation + with the probablity specified by the provided floating point number. + encoded_goal_pose: A pre-encoded goal pose for use in actor/critic classification of proposals. 
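+
+    # Example
+
+    A rough call sketch; the variable names are placeholders for batches
+    prepared as described above:
+
+        X = encode_action_and_images(
+            data_features_to_extract=['current_xyz_aaxyz_nsc_8'],
+            poses=poses, action_labels=action_labels,
+            init_images=init_images, current_images=current_images)
+        # X is a list: [preprocessed init images, preprocessed current
+        # images, action_poses_vec]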
+ """ + + action_labels = np.array(action_labels) + init_images = keras_applications.imagenet_utils._preprocess_numpy_input( + np.array(init_images, dtype=np.float32), + data_format='channels_last', mode='tf') + current_images = keras_applications.imagenet_utils._preprocess_numpy_input( + np.array(current_images, dtype=np.float32), + data_format='channels_last', mode='tf') + poses = np.array(poses) + + # print('poses shape: ' + str(poses.shape)) + encoded_poses = grasp_metrics.batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc( + poses, random_augmentation=random_augmentation) + + if np.any(encoded_poses < 0 - epsilon) or np.any(encoded_poses > 1 + epsilon): + raise ValueError('An encoded pose was outside the [0,1] range! Update your encoding. poses: ' + + str(poses) + ' encoded poses: ' + str(encoded_poses)) + + if (data_features_to_extract is None or + 'current_xyz_3' in data_features_to_extract or + 'image_0_image_n_vec_xyz_10' in data_features_to_extract or + 'image_0_image_n_vec_xyz_nxygrid_12' in data_features_to_extract): + # regression input case for translation only + action_poses_vec = np.concatenate([encoded_poses[:, :3], action_labels], axis=-1) + X = [init_images, current_images, action_poses_vec] + elif (data_features_to_extract is None or + 'current_xyz_aaxyz_nsc_8' in data_features_to_extract or + 'image_0_image_n_vec_xyz_aaxyz_nsc_15' in data_features_to_extract or + 'image_0_image_n_vec_xyz_aaxyz_nsc_nxygrid_17' in data_features_to_extract): + # default, regression input case for translation and rotation + action_poses_vec = np.concatenate([encoded_poses, action_labels], axis=-1) + X = [init_images, current_images, action_poses_vec] + elif(data_features_to_extract is None or 'image_0_image_n_vec_0_vec_n_xyz_aaxyz_nsc_nxygrid_25' in data_features_to_extract): + # this is for classification of actions + action_poses_vec = np.concatenate([encoded_poses, encoded_goal_pose, action_labels], axis=-1) + X = [init_images, current_images, action_poses_vec] + elif 'proposed_goal_xyz_aaxyz_nsc_8' in data_features_to_extract: + # classification input case + proposed_and_current_action_vec = np.concatenate([encoded_poses, action_labels, y], axis=-1) + X = [init_images, current_images, proposed_and_current_action_vec] + + else: + raise ValueError('Unsupported data input: ' + str(data_features_to_extract)) + + if (data_features_to_extract is not None and + ('image_0_image_n_vec_xyz_10' in data_features_to_extract or + 'image_0_image_n_vec_xyz_aaxyz_nsc_15' in data_features_to_extract or + 'image_0_image_n_vec_xyz_nxygrid_12' in data_features_to_extract or + 'image_0_image_n_vec_xyz_aaxyz_nsc_nxygrid_17' in data_features_to_extract or + 'image_0_image_n_vec_0_vec_n_xyz_aaxyz_nsc_nxygrid_25' in data_features_to_extract)): + # make the giant data cube if it is requested + vec = np.squeeze(X[2:]) + assert len(vec.shape) == 2, 'we only support a 2D input vector for now but found shape:' + str(vec.shape) + X = concat_images_with_tiled_vector_np(X[:2], vec) + + + # check if any of the data features expect nxygrid normalized x, y coordinate grid values + grid_labels = [s for s in data_features_to_extract if 'nxygrid' in s] + # print('grid labels: ' + str(grid_labels)) + if (data_features_to_extract is not None and grid_labels): + X = concat_unit_meshgrid_np(X) + return X + + +def inference_mode_gen(file_names): + """ Generate data for all time steps in a single example. 
+ """ + file_list_updated = [] + # print(len(file_names)) + for f_name in file_names: + with h5py.File(f_name, 'r') as data: + file_len = len(data['gripper_action_goal_idx']) - 1 + # print(file_len) + list_id = [f_name] * file_len + file_list_updated = file_list_updated + list_id + return file_list_updated + + +class CostarBlockStackingSequence(Sequence): + '''Generates a batch of data from the stacking dataset. + + # TODO(ahundt) match the preprocessing /augmentation apis of cornell & google dataset + ''' + def __init__(self, list_example_filenames, + label_features_to_extract=None, data_features_to_extract=None, + total_actions_available=41, + batch_size=32, shuffle=False, seed=0, + random_state=None, + is_training=True, random_augmentation=None, + random_shift=False, + output_shape=None, + blend_previous_goal_images=False, + estimated_time_steps_per_example=250, verbose=0, inference_mode=False, one_hot_encoding=False): + '''Initialization + + # Arguments + + list_Ids: a list of file paths to be read + batch_size: specifies the size of each batch + shuffle: boolean to specify shuffle after each epoch + seed: a random seed to use. If seed is None it will be in order! + random_state: A numpy RandomState object, if not provided one will be generated from the seed. + Used exclusively for example data ordering and the indices to visit within an example. + # TODO(ahundt) better notes about the two parameters below. See choose_features_and_metrics() in cornell_grasp_trin.py. + label_features_to_extract: defaults to regression options, classification options are also available + data_features_to_extract: defaults to regression options, classification options are also available + Options include 'image_0_image_n_vec_xyz_aaxyz_nsc_15' which is a giant NHWC cube of image and pose data, + 'current_xyz_aaxyz_nsc_8' a vector with the current pose, + 'proposed_goal_xyz_aaxyz_nsc_8' a pose at the end of the current action (for classification cases), + 'image_0_image_n_vec_xyz_nxygrid_12' another giant cube without rotation and with explicit normalized xy coordinates, + 'image_0_image_n_vec_xyz_aaxyz_nsc_nxygrid_17' another giant cube with rotation and explicit normalized xy coordinates. + random_augmentation: None or a float value between 0 and 1 indiciating how frequently random augmentation should be applied. + estimated_time_steps_per_example: The number of images in each example varies, + so we simply sample in proportion to an estimated number of images per example. + Due to random sampling, there is no guarantee that every image will be visited once! + However, the images can be visited in a fixed order, particularly when is_training=False. + one_hot_encoding flag triggers one hot encoding and thus numbers at the end of labels might not correspond to the actual size. + + # Explanation of abbreviations: + + aaxyz_nsc: is an axis and angle in xyz order, where the angle is defined by a normalized sin(theta) cos(theta). + nxygrid: at each pixel, concatenate two additional channels containing the pixel coordinate x and y as values between 0 and 1. + This is similar to uber's "coordconv" paper. 
+ ''' + if random_state is None: + random_state = RandomState(seed) + self.batch_size = batch_size + self.list_example_filenames = list_example_filenames + self.shuffle = shuffle + self.seed = seed + self.random_state = random_state + self.output_shape = output_shape + self.is_training = is_training + self.verbose = verbose + self.on_epoch_end() + if isinstance(label_features_to_extract, str): + label_features_to_extract = [label_features_to_extract] + self.label_features_to_extract = label_features_to_extract + # TODO(ahundt) total_actions_available can probably be extracted from the example hdf5 files and doesn't need to be a param + if isinstance(data_features_to_extract, str): + data_features_to_extract = [data_features_to_extract] + self.data_features_to_extract = data_features_to_extract + self.total_actions_available = total_actions_available + self.random_augmentation = random_augmentation + self.random_shift = random_shift + self.inference_mode = inference_mode + self.infer_index = 0 + self.one_hot_encoding = one_hot_encoding + + self.blend = blend_previous_goal_images + self.estimated_time_steps_per_example = estimated_time_steps_per_example + if self.inference_mode is True: + self.list_example_filenames = inference_mode_gen(self.list_example_filenames) + # if crop_shape is None: + # # height width 3 + # crop_shape = (224, 224, 3) + # self.crop_shape = crop_shape + + def __len__(self): + """Denotes the number of batches per epoch + """ + return int(np.floor(len(self.list_example_filenames) / self.batch_size)) + + def __getitem__(self, index): + """Generate one batch of data + """ + # Generate indexes of the batch + indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] + if self.verbose > 0: + print("batch getitem indices:" + str(indexes)) + # Find list of example_filenames + list_example_filenames_temp = [self.list_example_filenames[k] for k in indexes] + # Generate data + self.infer_index = self.infer_index + 1 + X, y = self.__data_generation(list_example_filenames_temp, self.infer_index) + + return X, y + + def get_estimated_time_steps_per_example(self): + """ Get the estimated images per example, + + Run extra steps in proportion to this if you want to get close to visiting every image. + """ + return self.estimated_time_steps_per_example + + def on_epoch_end(self): + """ Updates indexes after each epoch + """ + if self.seed is not None and not self.is_training: + # repeat the same order if we're validating or testing + # continue the large random sequence for training + self.random_state.seed(self.seed) + self.indexes = np.arange(len(self.list_example_filenames)) + if self.shuffle is True: + self.random_state.shuffle(self.indexes) + + def __data_generation(self, list_Ids, images_index): + """ Generates data containing batch_size samples + + # Arguments + + list_Ids: a list of file paths to be read + """ + + def JpegToNumpy(jpeg): + stream = io.BytesIO(jpeg) + im = np.asarray(Image.open(stream)) + try: + return im.astype(np.uint8) + except(TypeError) as exception: + print("Failed to convert PIL image type", exception) + print("type ", type(im), "len ", len(im)) + + def ConvertImageListToNumpy(data, format='numpy', data_format='NHWC'): + """ Convert a list of binary jpeg or png files to numpy format. 
+ + # Arguments + + data: a list of binary jpeg images to convert + format: default 'numpy' returns a 4d numpy array, + 'list' returns a list of 3d numpy arrays + """ + length = len(data) + imgs = [] + for raw in data: + img = JpegToNumpy(raw) + if data_format == 'NCHW': + img = np.transpose(img, [2, 0, 1]) + imgs.append(img) + if format == 'numpy': + imgs = np.array(imgs) + return imgs + try: + # Initialization + if self.verbose > 0: + print("generating batch: " + str(list_Ids)) + X = [] + init_images = [] + current_images = [] + poses = [] + goal_pose = [] + y = [] + action_labels = [] + action_successes = [] + example_filename = '' + if isinstance(list_Ids, int): + # if it is just a single int + # make it a list so we can iterate + list_Ids = [list_Ids] + + # Generate data + for i, example_filename in enumerate(list_Ids): + example_filename = os.path.expanduser(example_filename) + if self.verbose > 0: + print('reading: ' + str(i) + ' path: ' + str(example_filename)) + # Store sample + # X[i,] = np.load('data/' + example_filename + '.npy') + x = () + try: + if not os.path.isfile(example_filename): + raise ValueError('CostarBlockStackingSequence: Trying to open something which is not a file: ' + str(example_filename)) + with h5py.File(example_filename, 'r') as data: + if 'gripper_action_goal_idx' not in data or 'gripper_action_label' not in data: + raise ValueError('block_stacking_reader.py: You need to run preprocessing before this will work! \n' + + ' python2 ctp_integration/scripts/view_convert_dataset.py --path ~/.keras/datasets/costar_block_stacking_dataset_v0.4 --preprocess_inplace gripper_action --write' + '\n File with error: ' + str(example_filename)) + # indices = [0] + # len of goal indexes is the same as the number of images, so this saves loading all the images + all_goal_ids = np.array(data['gripper_action_goal_idx']) + if('stacking_reward' in self.label_features_to_extract): + # TODO(ahundt) move this check out of the stacking reward case after files have been updated + if all_goal_ids[-1] > len(all_goal_ids): + raise ValueError(' File contains goal id greater than total number of frames ' + str(example_filename)) + if len(all_goal_ids) < 2: + print('block_stacking_reader.py: ' + str(len(all_goal_ids)) + ' goal indices in this file, skipping: ' + example_filename) + if 'success' in example_filename: + label_constant = 1 + else: + label_constant = 0 + stacking_reward = np.arange(len(all_goal_ids)) + stacking_reward = 0.999 * stacking_reward * label_constant + # print("reward estimates", stacking_reward) + + if self.seed is not None: + rand_max = len(all_goal_ids) - 1 + if rand_max <= 1: + print('CostarBlockStackingSequence: not enough goal ids: ' + str(all_goal_ids) + ' file: ' + str(rand_max)) + image_indices = self.random_state.randint(1, rand_max, 1) + else: + raise NotImplementedError + indices = [0] + list(image_indices) + + if self.blend: + img_indices = get_past_goal_indices(image_indices, all_goal_ids, filename=example_filename) + else: + img_indices = indices + if self.inference_mode is True: + if images_index >= len(data['gripper_action_goal_idx']): + self.infer_index = 1 + image_idx = 1 + # image_idx = (images_index % (len(data['gripper_action_goal_idx']) - 1)) + 1 + else: + image_idx = images_index + + img_indices = [0, image_idx] + # print("image_index", image_idx) + # print("image_true", images_index, len(data['gripper_action_goal_idx'])) + # print("new_indices-----", image_idx) + if self.verbose > 0: + print("Indices --", indices) + print('img_indices: ' + 
str(img_indices)) + rgb_images = list(data['image'][img_indices]) + rgb_images = ConvertImageListToNumpy(rgb_images, format='numpy') + + if self.blend: + # TODO(ahundt) move this to after the resize loop for a speedup + blended_image = blend_image_sequence(rgb_images) + rgb_images = [rgb_images[0], blended_image] + # resize using skimage + rgb_images_resized = [] + for k, images in enumerate(rgb_images): + if (self.is_training and self.random_augmentation is not None and + self.random_shift and np.random.random() > self.random_augmentation): + # apply random shift to the images before resizing + images = keras_preprocessing.image.random_shift( + images, + # height, width + 1./(48. * 2.), 1./(64. * 2.), + row_axis=0, col_axis=1, channel_axis=2) + # TODO(ahundt) improve crop/resize to match cornell_grasp_dataset_reader + if self.output_shape is not None: + resized_image = resize(images, self.output_shape, mode='constant', preserve_range=True) + else: + resized_image = images + if self.is_training and self.random_augmentation: + # do some image augmentation with random erasing & cutout + resized_image = random_eraser(resized_image) + rgb_images_resized.append(resized_image) + + init_images.append(rgb_images_resized[0]) + current_images.append(rgb_images_resized[1]) + poses.append(np.array(data['pose'][indices[1:]])[0]) + if(self.data_features_to_extract is not None and 'image_0_image_n_vec_0_vec_n_xyz_aaxyz_nsc_nxygrid_25' in self.data_features_to_extract): + next_goal_idx = all_goal_ids[indices[1:][0]] + goal_pose.append(np.array(data['pose'][next_goal_idx])) + print("final pose added", goal_pose) + current_stacking_reward = stacking_reward[indices[1]] + print("reward estimate", current_stacking_reward) + # x = x + tuple([rgb_images[indices]]) + # x = x + tuple([np.array(data['pose'])[indices]]) + + if (self.data_features_to_extract is not None and + ('image_0_image_n_vec_xyz_aaxyz_nsc_15' in self.data_features_to_extract or + 'image_0_image_n_vec_xyz_nxygrid_12' in self.data_features_to_extract or + 'image_0_image_n_vec_xyz_aaxyz_nsc_nxygrid_17' in self.data_features_to_extract or + 'image_0_image_n_vec_0_vec_n_xyz_aaxyz_nsc_nxygrid_25' in self.data_features_to_extract) and not self.one_hot_encoding): + # normalized floating point encoding of action vector + # from 0 to 1 in a single float which still becomes + # a 2d array of dimension batch_size x 1 + # np.expand_dims(data['gripper_action_label'][indices[1:]], axis=-1) / self.total_actions_available + for j in indices[1:]: + action = [float(data['gripper_action_label'][j] / self.total_actions_available)] + action_labels.append(action) + else: + # one hot encoding + for j in indices[1:]: + # generate the action label one-hot encoding + action = np.zeros(self.total_actions_available) + action[data['gripper_action_label'][j]] = 1 + action_labels.append(action) + # action_labels = np.array(action_labels) + + # print(action_labels) + # x = x + tuple([action_labels]) + # X.append(x) + # action_labels = np.unique(data['gripper_action_label']) + # print(np.array(data['labels_to_name']).shape) + # X.append(np.array(data['pose'])[indices]) + + # Store class + label = () + # change to goals computed + index1 = indices[1] + goal_ids = all_goal_ids[index1] + # print(index1) + label = np.array(data['pose'])[goal_ids] + # print(type(label)) + # for items in list(data['all_tf2_frames_from_base_link_vec_quat_xyzxyzw_json'][indices]): + # json_data = json.loads(items.decode('UTF-8')) + # label = label + tuple([json_data['gripper_center']]) + # 
print(np.array(json_data['gripper_center']))
+                        # print(json_data.keys())
+                        # y.append(np.array(json_data['camera_rgb_frame']))
+                        if self.label_features_to_extract is not None and 'stacking_reward' in self.label_features_to_extract:
+                            # print(y)
+                            y.append(current_stacking_reward)
+                        else:
+                            y.append(label)
+                        if 'success' in example_filename:
+                            action_successes = action_successes + [1]
+                        else:
+                            action_successes = action_successes + [0]
+                    # print("y = ", y)
+                except IOError as ex:
+                    print('Error: Skipping file due to IO error when opening ' +
+                          example_filename + ': ' + str(ex) + ' using the last example twice for batch')
+
+            action_labels = np.array(action_labels)
+            init_images = keras_applications.imagenet_utils._preprocess_numpy_input(
+                np.array(init_images, dtype=np.float32),
+                data_format='channels_last', mode='tf')
+            current_images = keras_applications.imagenet_utils._preprocess_numpy_input(
+                np.array(current_images, dtype=np.float32),
+                data_format='channels_last', mode='tf')
+            poses = np.array(poses)
+
+            # print('poses shape: ' + str(poses.shape))
+            encoded_poses = grasp_metrics.batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc(
+                poses, random_augmentation=self.random_augmentation)
+            if self.data_features_to_extract is None or 'image_0_image_n_vec_0_vec_n_xyz_aaxyz_nsc_nxygrid_25' in self.data_features_to_extract:
+                # TODO(ahundt) encoded_goal_pose is currently computed from poses; goal_pose is collected above but not yet encoded or used
+                encoded_goal_pose = grasp_metrics.batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc(
+                    poses, random_augmentation=self.random_augmentation)
+                # encoded_poses = np.array([encoded_poses, encoded_goal_pose])
+
+            # print('encoded poses shape: ' + str(encoded_poses.shape))
+            # print('action labels shape: ' + str(action_labels.shape))
+            # print('encoded poses vec shape: ' + str(action_poses_vec.shape))
+            # print("---", init_images.shape)
+            # init_images = tf.image.resize_images(init_images, [224, 224])
+            # current_images = tf.image.resize_images(current_images, [224, 224])
+            # print("---", init_images.shape)
+            # X = init_images
+            X = encode_action_and_images(
+                data_features_to_extract=self.data_features_to_extract,
+                poses=poses, action_labels=action_labels,
+                init_images=init_images, current_images=current_images,
+                # only apply random augmentation during training
+                y=y, random_augmentation=self.random_augmentation if self.is_training else None)
+
+            # print("type=======", type(X))
+            # print("shape=====", X.shape)
+
+            # determine the label
+            if self.label_features_to_extract is not None and 'stacking_reward' in self.label_features_to_extract:
+                y = encode_label(self.label_features_to_extract, y, action_successes, self.random_augmentation, current_stacking_reward)
+            else:
+                y = encode_label(self.label_features_to_extract, y, action_successes, self.random_augmentation, None)
+
+            # Debugging checks
+            if X is None:
+                raise ValueError('Unsupported input data for X: ' + str(x))
+            if y is None:
+                raise ValueError('Unsupported input data for y: ' + str(x))
+
+            # Assemble the data batch
+            batch = (X, y)
+
+            if self.verbose > 0:
+                # diff should be nonzero for most timesteps except just before the gripper closes! 
+                print('encoded current poses: ' + str(poses) + ' labels: ' + str(y))
+                # commented next line due to dimension issue
+                # + ' diff: ' + str(poses - y))
+                print("generated batch: " + str(list_Ids))
+        except Exception as ex:
+            print('CostarBlockStackingSequence: Keras will often swallow exceptions without a stack trace, '
+                  'so we are printing the stack trace here before re-raising the error.')
+            ex_type, ex, tb = sys.exc_info()
+            traceback.print_tb(tb)
+            # deletion must be explicit to prevent leaks
+            # https://stackoverflow.com/a/16946886/99379
+            del tb
+            raise
+
+        return batch
+
+
+def block_stacking_generator(sequence):
+
+    # training_generator = CostarBlockStackingSequence(filenames, batch_size=1)
+    epoch_size = len(sequence)
+    step = 0
+    while True:
+        if step > epoch_size:
+            step = 0
+            sequence.on_epoch_end()
+        batch = sequence.__getitem__(step)
+        # debug statements, note they stop the program after the first batch:
+        # print(np.array(batch).shape)
+        # print(np.array(batch[0][0]).shape)
+        # exit()
+        step += 1
+        yield batch
+
+
+if __name__ == "__main__":
+    visualize = False
+    output_shape = (224, 224, 3)
+    # output_shape = None
+    tf.enable_eager_execution()
+    filenames = glob.glob(os.path.expanduser('~/.keras/datasets/costar_block_stacking_dataset_v0.4/*success.h5f'))
+    # print(filenames)
+    # filenames_new = inference_mode_gen(filenames)
+    training_generator = CostarBlockStackingSequence(
+        filenames, batch_size=1, verbose=1,
+        output_shape=output_shape,
+        label_features_to_extract='grasp_goal_xyz_aaxyz_nsc_8',
+        data_features_to_extract=['current_xyz_aaxyz_nsc_8'],
+        blend_previous_goal_images=False, inference_mode=False)
+    num_batches = len(training_generator)
+    print(num_batches)
+    # print(len(filenames_new))
+
+    bsg = block_stacking_generator(training_generator)
+    iter(bsg)
+    from tqdm import tqdm
+    progress = tqdm(range(num_batches))
+    for i in progress:
+        data = next(bsg)
+        progress.set_description('step: ' + str(i) + ' data type: ' + str(type(data)))
+
+        if visualize:
+            import matplotlib
+            import matplotlib.pyplot as plt
+            # clear view image
+            plt.imshow((np.squeeze(data[0][0]) / 2.0) + 0.5)
+            plt.draw()
+            plt.pause(0.25)
+            # current timestep image
+            plt.imshow((np.squeeze(data[0][1]) / 2.0) + 0.5)
+            plt.draw()
+            plt.pause(0.25)
+            # uncomment the following line to wait for
+            # one window to be closed before showing the next
+            # plt.show()
+    # a = next(training_generator)
+    enqueuer = OrderedEnqueuer(
+        training_generator,
+        use_multiprocessing=False,
+        shuffle=True)
+    enqueuer.start(workers=1, max_queue_size=1)
+    generator = iter(enqueuer.get())
+    print("-------------------")
+    generator_output = next(generator)
+    print("-------------------op")
+    x, y = generator_output
+    print("x-shape-----------", x.shape)
+    print("y-shape-----------", y.shape)
+
+    # X, y = training_generator.__getitem__(1)
+    # print(X.keys())
+    # print(X[0].shape)
+    # print(X[0].shape)
+    # print(y[0])
diff --git a/enas/cifar10/data_utils.py b/enas/cifar10/data_utils.py
index 1f8d615..f727f50 100644
--- a/enas/cifar10/data_utils.py
+++ b/enas/cifar10/data_utils.py
@@ -53,6 +53,7 @@ def _read_fmnist_data(data_path):
   labels["train"] = np.array(data.train.labels, dtype = np.int32)
   labels["test"] = np.array(data.test.labels, dtype = np.int32)
   print("Read and processed data..")
+  print(labels["test"])
   return images, labels
 
@@ -80,6 +81,9 @@ def read_data(data_path, num_valids=5000, dataset = "cifar"):
     images, labels = valid_split_data(images, labels, num_valids)
     return images, labels
+  if dataset == "stacking":
+    images["path"] = data_path
+    return images, labels
   else:
     train_files = [
"data_batch_1", diff --git a/enas/cifar10/general_child.py b/enas/cifar10/general_child.py index 9896c7f..cd4f598 100644 --- a/enas/cifar10/general_child.py +++ b/enas/cifar10/general_child.py @@ -12,10 +12,11 @@ from enas.cifar10.image_ops import conv from enas.cifar10.image_ops import fully_connected from enas.cifar10.image_ops import batch_norm +from enas.cifar10.image_ops import norm from enas.cifar10.image_ops import batch_norm_with_mask from enas.cifar10.image_ops import relu from enas.cifar10.image_ops import max_pool -from enas.cifar10.image_ops import global_avg_pool +from enas.cifar10.image_ops import global_max_pool from enas.utils import count_model_params from enas.utils import get_train_ops @@ -101,8 +102,10 @@ def _get_C(self, x): x: tensor of shape [N, H, W, C] or [N, C, H, W] """ if self.data_format == "NHWC": + assert x.get_shape().as_list()[3] is not None return x.get_shape()[3].value elif self.data_format == "NCHW": + assert x.get_shape().as_list()[1] is not None return x.get_shape()[1].value else: raise ValueError("Unknown data_format '{0}'".format(self.data_format)) @@ -112,6 +115,7 @@ def _get_HW(self, x): Args: x: tensor of shape [N, H, W, C] or [N, C, H, W] """ + assert x.get_shape().as_list()[2] is not None return x.get_shape()[2].value def _get_strides(self, stride): @@ -136,7 +140,7 @@ def _factorized_reduction(self, x, out_filters, stride, is_training): w = create_weight("w", [1, 1, inp_c, out_filters]) x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) + x = norm(x, is_training, data_format=self.data_format) return x stride_spec = self._get_strides(stride) @@ -171,7 +175,7 @@ def _factorized_reduction(self, x, out_filters, stride, is_training): # Concat and apply BN final_path = tf.concat(values=[path1, path2], axis=concat_axis) - final_path = batch_norm(final_path, is_training, + final_path = norm(final_path, is_training, data_format=self.data_format) return final_path @@ -194,11 +198,11 @@ def _model(self, images, is_training, reuse=False): layers = [] out_filters = self.out_filters - C = self._get_C(images) + C = self._get_C(images) with tf.variable_scope("stem_conv"): w = create_weight("w", [C, C, C, out_filters]) x = tf.nn.conv2d(images, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) + x = norm(x, is_training, data_format=self.data_format) layers.append(x) if self.whole_channels: @@ -229,7 +233,7 @@ def _model(self, images, is_training, reuse=False): start_idx += 2 * self.num_branches + layer_id print(layers[-1]) - x = global_avg_pool(x, data_format=self.data_format) + x = global_max_pool(x, data_format=self.data_format) if is_training: x = tf.nn.dropout(x, self.keep_prob) with tf.variable_scope("fc"): @@ -351,8 +355,8 @@ def _enas_layer(self, layer_id, prev_layers, start_idx, out_filters, is_training branches = tf.reshape(branches, [N, -1, H, W]) out = tf.nn.conv2d( branches, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - out = batch_norm(out, is_training, data_format=self.data_format) - out = tf.nn.relu(out) + out = norm(out, is_training, data_format=self.data_format) + out = tf.nn.elu(out) if layer_id > 0: if self.whole_channels: @@ -368,7 +372,7 @@ def _enas_layer(self, layer_id, prev_layers, start_idx, out_filters, is_training lambda: tf.zeros_like(prev_layers[i]))) res_layers.append(out) out = tf.add_n(res_layers) - out = batch_norm(out, is_training, data_format=self.data_format) + 
out = norm(out, is_training, data_format=self.data_format) return out @@ -396,17 +400,17 @@ def _fixed_layer( filter_size = size[count] with tf.variable_scope("conv_1x1"): w = create_weight("w", [1, 1, inp_c, out_filters]) - out = tf.nn.relu(inputs) + out = tf.nn.elu(inputs) out = tf.nn.conv2d(out, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - out = batch_norm(out, is_training, data_format=self.data_format) + out = norm(out, is_training, data_format=self.data_format) with tf.variable_scope("conv_{0}x{0}".format(filter_size)): w = create_weight("w", [filter_size, filter_size, out_filters, out_filters]) - out = tf.nn.relu(out) + out = tf.nn.elu(out) out = tf.nn.conv2d(out, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - out = batch_norm(out, is_training, data_format=self.data_format) + out = norm(out, is_training, data_format=self.data_format) elif count == 4: pass elif count == 5: @@ -449,10 +453,10 @@ def _fixed_layer( branches = tf.concat(branches, axis=3) elif self.data_format == "NCHW": branches = tf.concat(branches, axis=1) - out = tf.nn.relu(branches) + out = tf.nn.elu(branches) out = tf.nn.conv2d(out, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - out = batch_norm(out, is_training, data_format=self.data_format) + out = norm(out, is_training, data_format=self.data_format) if layer_id > 0: if self.whole_channels: @@ -477,10 +481,10 @@ def _fixed_layer( with tf.variable_scope("skip"): w = create_weight( "w", [1, 1, total_skip_channels * out_filters, out_filters]) - out = tf.nn.relu(out) + out = tf.nn.elu(out) out = tf.nn.conv2d( out, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - out = batch_norm(out, is_training, data_format=self.data_format) + out = norm(out, is_training, data_format=self.data_format) return out @@ -504,8 +508,8 @@ def _conv_branch(self, inputs, filter_size, is_training, count, out_filters, with tf.variable_scope("inp_conv_1"): w = create_weight("w", [1, 1, inp_c, out_filters]) x = tf.nn.conv2d(inputs, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) - x = tf.nn.relu(x) + x = norm(x, is_training, data_format=self.data_format) + x = tf.nn.elu(x) with tf.variable_scope("out_conv_{}".format(filter_size)): if start_idx is None: @@ -515,12 +519,13 @@ def _conv_branch(self, inputs, filter_size, is_training, count, out_filters, w_point = create_weight("w_point", [1, 1, out_filters * ch_mul, count]) x = tf.nn.separable_conv2d(x, w_depth, w_point, strides=[1, 1, 1, 1], padding="SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) + x = norm(x, is_training, data_format=self.data_format) else: w = create_weight("w", [filter_size, filter_size, inp_c, count]) x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) + x = norm(x, is_training, data_format=self.data_format) else: + print('TODO(ahundt) batch_norm_with_mask is definitely called... 
make a group norm version!')
       if separable:
         w_depth = create_weight("w_depth", [filter_size, filter_size, out_filters, ch_mul])
         w_point = create_weight("w_point", [out_filters, out_filters * ch_mul])
@@ -544,7 +549,7 @@ def _conv_branch(self, inputs, filter_size, is_training, count, out_filters,
         mask = tf.logical_and(start_idx <= mask, mask < start_idx + count)
         x = batch_norm_with_mask(
           x, is_training, mask, out_filters, data_format=self.data_format)
-      x = tf.nn.relu(x)
+      x = tf.nn.elu(x)
     return x
 
   def _pool_branch(self, inputs, is_training, count, avg_or_max, start_idx=None):
@@ -566,8 +571,8 @@ def _pool_branch(self, inputs, is_training, count, avg_or_max, start_idx=None):
     with tf.variable_scope("conv_1"):
       w = create_weight("w", [1, 1, inp_c, self.out_filters])
       x = tf.nn.conv2d(inputs, w, [1, 1, 1, 1], "SAME", data_format=self.data_format)
-      x = batch_norm(x, is_training, data_format=self.data_format)
-      x = tf.nn.relu(x)
+      x = norm(x, is_training, data_format=self.data_format)
+      x = tf.nn.elu(x)
 
     with tf.variable_scope("pool"):
       if self.data_format == "NHWC":
diff --git a/enas/cifar10/grasp_metrics.py b/enas/cifar10/grasp_metrics.py
new file mode 100644
index 0000000..8d8cf5c
--- /dev/null
+++ b/enas/cifar10/grasp_metrics.py
@@ -0,0 +1,1366 @@
+
+import os
+import copy
+import math
+import numpy as np
+from tqdm import tqdm
+
+import keras
+# the keras backend is used by rectangle_homogeneous_lines() below
+from keras import backend as K
+import tensorflow as tf
+from tensorflow.python.platform import flags
+from shapely.geometry import Polygon
+from pyquaternion import Quaternion
+# import the submodule explicitly so sklearn.preprocessing.normalize is available
+import sklearn.preprocessing
+
+import grasp_utilities
+
+# class Vector:
+#     # http://www.mathopenref.com/coordpolygonarea.html
+#     # https://stackoverflow.com/a/45268241/99379
+#     def __init__(self, x, y):
+#         self.x = x
+#         self.y = y
+
+#     def __add__(self, v):
+#         if not isinstance(v, Vector):
+#             return NotImplemented
+#         return Vector(self.x + v.x, self.y + v.y)
+
+#     def __sub__(self, v):
+#         if not isinstance(v, Vector):
+#             return NotImplemented
+#         return Vector(self.x - v.x, self.y - v.y)
+
+#     def cross(self, v):
+#         if not isinstance(v, Vector):
+#             return NotImplemented
+#         return self.x*v.y - self.y*v.x
+
+
+# class Line:
+#     # ax + by + c = 0
+#     def __init__(self, v1, v2):
+#         self.a = v2.y - v1.y
+#         self.b = v1.x - v2.x
+#         self.c = v2.cross(v1)
+
+#     def __call__(self, p):
+#         return self.a*p.x + self.b*p.y + self.c
+
+#     def intersection(self, other):
+#         # http://www.mathopenref.com/coordpolygonarea.html
+#         # https://stackoverflow.com/a/45268241/99379
+#         # See e.g. 
https://en.wikipedia.org/wiki/Line%E2%80%93line_intersection#Using_homogeneous_coordinates +# if not isinstance(other, Line): +# return NotImplemented +# w = self.a*other.b - self.b*other.a +# return Vector( +# (self.b*other.c - self.c*other.b)/w, +# (self.c*other.a - self.a*other.c)/w +# ) + + +# def rectangle_vertices(cx, cy, w, h, theta): +# # http://www.mathopenref.com/coordpolygonarea.html +# # https://stackoverflow.com/a/45268241/99379 +# dx = w/2 +# dy = h/2 +# dxcos = dx*cos(theta) +# dxsin = dx*sin(theta) +# dycos = dy*cos(theta) +# dysin = dy*sin(theta) +# return ( +# Vector(cx, cy) + Vector(-dxcos - -dysin, -dxsin + -dycos), +# Vector(cx, cy) + Vector( dxcos - -dysin, dxsin + -dycos), +# Vector(cx, cy) + Vector( dxcos - dysin, dxsin + dycos), +# Vector(cx, cy) + Vector(-dxcos - dysin, -dxsin + dycos) +# ) + +# def intersection_area(r1, r2): +# # http://www.mathopenref.com/coordpolygonarea.html +# # https://stackoverflow.com/a/45268241/99379 +# # r1 and r2 are in (center, width, height, rotation) representation +# # First convert these into a sequence of vertices + +# rect0 = rectangle_vertices(*r1) +# rect1 = rectangle_vertices(*r2) + +# # Use the vertices of the first rectangle as +# # starting vertices of the intersection polygon. +# rect0 = rect0 + +# # Loop over the edges of the second rectangle +# for p, q in zip(rect1, rect1[1:] + rect1[:1]): +# if len(rect0) <= 2: +# break # No intersection + +# line = Line(p, q) + +# # Any point p with line(p) <= 0 is on the "inside" (or on the boundary), +# # any point p with line(p) > 0 is on the "outside". + +# # Loop over the edges of the rect0 polygon, +# # and determine which part is inside and which is outside. +# new_intersection = [] +# line_values = [line(t) for t in rect0] +# for s, t, s_value, t_value in zip( +# rect0, rect0[1:] + rect0[:1], +# line_values, line_values[1:] + line_values[:1]): +# if s_value <= 0: +# new_intersection.append(s) +# if s_value * t_value < 0: +# # Points are on opposite sides. +# # Add the intersection of the lines to new_intersection. 
+# intersection_point = line.intersection(Line(s, t)) +# new_intersection.append(intersection_point) + +# intersection = new_intersection + +# # Calculate area +# if len(intersection) <= 2: +# return 0 + +# return 0.5 * sum(p.x*q.y - p.y*q.x for p, q in +# zip(intersection, intersection[1:] + intersection[:1])) + + +# intersection_area(r0y0, r0x0, r0y1, r0x1, r0y2, r0x2, r0y3, r0x3, r1y0, r1x0, r1y1, r1x1, r1y2, r1x2, r1y3, r1x3): +def rectangle_points(r0y0, r0x0, r0y1, r0x1, r0y2, r0x2, r0y3, r0x3): + p0yx = np.array([r0y0, r0x0]) + p1yx = np.array([r0y1, r0x1]) + p2yx = np.array([r0y2, r0x2]) + p3yx = np.array([r0y3, r0x3]) + return [p0yx, p1yx, p2yx, p3yx] + + +def rectangle_vectors(rp): + """ + # Arguments + + rp: rectangle points [p0yx, p1yx, p2yx, p3yx] + """ + v0 = rp[1] - rp[0] + v1 = rp[2] - rp[1] + v2 = rp[3] - rp[2] + v3 = rp[0] - rp[3] + + return [v0, v1, v2, v3] + + +def rectangle_homogeneous_lines(rv): + """ + + # Arguments + + rv: rectangle vectors [v0yx, v1yx, v2yx, v3yx] + + + # Returns + + [r0abc, r1abc, r2abc, r3abc] + + """ + # ax + by + c = 0 + dv = rv[0] - rv[1] + # TODO(ahundt) make sure cross product doesn't need to be in xy order + r0abc = K.concatenate([dv[0], dv[1], tf.cross(rv[0], rv[1])]) + dv = rv[1] - rv[2] + r1abc = K.concatenate([dv[1], dv[2], tf.cross(rv[1], rv[2])]) + dv = rv[2] - rv[3] + r2abc = K.concatenate([dv[2], dv[3], tf.cross(rv[2], rv[3])]) + dv = rv[3] - rv[0] + r3abc = K.concatenate([dv[3], dv[0], tf.cross(rv[3], rv[0])]) + return [r0abc, r1abc, r2abc, r3abc] + + +def homogeneous_line_intersection(hl0abc, hl1abc): + """ Given two homogenous lines return the intersection point in y,x coordinates + """ + a0 = hl0abc[0] + b0 = hl0abc[1] + c0 = hl0abc[2] + a1 = hl1abc[0] + b1 = hl1abc[1] + c1 = hl1abc[2] + w = a0 * b1 - b0 * a1 + py = (c0 * a1 - a0 * c1) / w + px = (b0 * c1 - c0 * b1) / w + return [py, px] + + +def line_at_point(l_abc, p_yx): + """ + + # Arguments + + l_abc: a line in homogenous coodinates + p_yx: a point with y, x coordinates + """ + return l_abc[0] * p_yx[1] + l_abc[1] * p_yx[0] + l_abc[2] + + +def intersection_points(rl0, rp1): + """ Evaluate rectangle lines at another rectangle's points + """ + lv = [ + line_at_point(rl0[0], rp1[0]), + line_at_point(rl0[1], rp1[1]), + line_at_point(rl0[2], rp1[2]), + line_at_point(rl0[3], rp1[3]), + ] + return lv + + +def rectangle_intersection_polygon(rp0, rl0, rp1, rl1): + """ Given two homogenous line rectangles, it returns the points for the polygon representing their intersection. + + # Arguments + + rp0: rectangle 0 defined with points + rl0: rectangle 0 defined with homogeneous lines + rp1: rectangle 1 defined with points + rp1: rectangle 1 defined with homogeneous lines + + # Returns + + Intersection polygon consisting of up to 8 points. + """ + # TODO(ahundt) this function is still set up for eager execution... figure it out as tf calls... + # http://www.mathopenref.com/coordpolygonarea.html + # https://stackoverflow.com/a/45268241/99379 + # Use the vertices of the first rectangle as + # starting vertices of the intersection polygon. + intersection = [] + for line1 in rl1: + line_values = [line_at_point(line1, t) for t in rp0] + + # Any point p with line(p) <= 0 is on the "inside" (or on the boundary), + # any point p with line(p) > 0 is on the "outside". + + # Loop over the edges of the rect0 polygon, + # and determine which part is inside and which is outside. 
+ new_intersection = [] + # points in rp0 rotated around by one + rp0_rot = grasp_utilities.rotate(rp0) + line_values_rot = grasp_utilities.rotate(line_values) + for s, t, s_value, t_value, line0 in zip( + rp0, rp0_rot, line_values, line_values_rot, rl0): + + if s_value <= 0: + new_intersection.append(s) + + st_value = s_value * t_value + intersection_point = homogeneous_line_intersection(line1, line0) + if st_value < 0: + # Points are on opposite sides. + # Add the intersection of the lines to new_intersection. + new_intersection.append(intersection_point) + + intersection = new_intersection + + return intersection + + +def polygon_area_four_points(rp): + """ + # Arguments + + rp: polygon defined by 4 points in y,x order + """ + # partial = p0x * p1y - p0y * p1x + partial0 = rp[0][1] * rp[1][0] - rp[0][0] * rp[1][1] + partial1 = rp[1][1] * rp[2][0] - rp[1][0] * rp[2][1] + partial2 = rp[2][1] * rp[3][0] - rp[2][0] * rp[3][1] + partial3 = rp[3][1] * rp[0][0] - rp[3][0] * rp[0][1] + full_sum = partial0 + partial1 + partial2 + partial3 + return 0.5 * full_sum + + +def polygon_area(poly): + # Calculate area + if len(poly) <= 2: + return 0 + + poly_rot = poly[1:] + poly[:1] + + return 0.5 * sum(p[1]*q[0] - p[0]*q[1] for p, q in zip(poly, poly_rot)) + + +def rectangle_vertices(h, w, cy, cx, sin_theta=None, cos_theta=None, theta=None): + """ Get the vertices from a parameterized bounding box. + + y, x ordering where 0,0 is the top left corner. + This matches matrix indexing. + + # http://www.mathopenref.com/coordpolygonarea.html + # https://stackoverflow.com/a/45268241/99379 + """ + if theta is not None: + sin_theta = np.sin(theta) + cos_theta = np.cos(theta) + # normalizing because this may be using the output of the neural network, + # so we turn it into an x y coordinate on the unit circle without changing + # the vector. + sin_theta, cos_theta = normalize_sin_theta_cos_theta(sin_theta, cos_theta) + + dx = w/2 + dy = h/2 + dxcos = dx * cos_theta + dxsin = dx * sin_theta + dycos = dy * cos_theta + dysin = dy * sin_theta + return [ + np.array([cy, cx]) + np.array([-dxsin + -dycos, -dxcos - -dysin]), + np.array([cy, cx]) + np.array([ dxsin + -dycos, dxcos - -dysin]), + np.array([cy, cx]) + np.array([ dxsin + dycos, dxcos - dysin]), + np.array([cy, cx]) + np.array([-dxsin + dycos, -dxcos - dysin]) + ] + + +def encode_sin2_cos2(sin2_cos2): + """ Converts values from the range (-1, 1) to the range (0, 1). + + The value passed is already expected to be in the format: + np.array([np.sin(2 * theta), np.cos(2 * theta)]) + + If you have 2 theta and want to encode that see `encode_2theta()`. + + """ + return (sin2_cos2 / 2.0) + 0.5 + + +def encode_sin_cos(sin_cos): + """ Converts values from the range (-1, 1) to the range (0, 1). + + The value passed is already expected to be in the format: + np.array([np.sin(theta), np.cos(theta)]) + + If you have theta and want to encode that see `encode_theta()`. 
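+
+    # Example
+
+    A rough round-trip sketch using the helpers in this file:
+
+        encoded = encode_theta(np.pi / 2)
+        # encoded == [1.0, 0.5] because [sin(theta), cos(theta)] == [1, 0]
+        theta = decode_sin_cos(encoded)
+        # theta is approximately pi / 2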
+ + """ + return (sin_cos / 2.0) + 0.5 + + +def encode_2theta(theta): + """ Encodes theta in radians to handle gripper symmetry in 0 to 1 domain + + # Returns + + [sin(2 * theta), cos(2 * theta)] / 2 + 0.5 + + """ + theta2 = theta * 2.0 + return encode_theta(theta2) + + +def encode_theta(theta): + """ Encodes theta in radians to asymmetric grippers in 0 to 1 domain + + # Returns + + [sin(theta), cos(theta)] / 2 + 0.5 + + """ + norm_sin_cos = encode_sin_cos(np.array([np.sin(theta), np.cos(theta)])) + return norm_sin_cos + + +def denorm_sin2_cos2(norm_sin2_cos2): + """ Undo normalization step of `encode_2theta_np()` + + + This converts values from the range (0, 1) to (-1, 1) + by subtracting 0.5 and multiplying by 2.0. + This function does not take any steps to ensure + the input obeys the law: + + sin ** 2 + cos ** 2 == 1 + + Since the values may have been generated by a neural network + it is important to fix this w.r.t. the provided values. + + # Arguments + + norm_sin2_cos2: normalized sin(2*theta) cos(2*theta) + + # Returns + + return actual sin(2*theta) cos(2*theta) + """ + return (norm_sin2_cos2 - 0.5) * 2.0 + + +def denorm_sin_cos(norm_sin_cos): + """ Undo normalization step of `encode_theta_np()` + + + This converts values from the range (0, 1) to (-1, 1) + by subtracting 0.5 and multiplying by 2.0. + This function does not take any steps to ensure + the input obeys the law: + + sin ** 2 + cos ** 2 == 1 + + Since the values may have been generated by a neural network + it is important to fix this w.r.t. the provided values. + + # Arguments + + norm_sin2_cos2: normalized sin(2*theta) cos(2*theta) + + # Returns + + return actual sin(theta) cos(theta) + """ + return (norm_sin_cos - 0.5) * 2.0 + + +def decode_sin2_cos2(norm_sin2_cos2): + """ Decodes the result of encode_2theta() back into an angle theta in radians. + """ + return decode_sin_cos(norm_sin2_cos2) / 2.0 + + +def decode_sin_cos(norm_sin2_cos2): + """ Decodes the result of encode_theta() back into an angle theta in radians. + """ + # rescale and shift from (0, 1) range + # back to (-1, 1) range + # + # note that denorm step is the same for both sin_cos and sin2_cos2 + sin2, cos2 = denorm_sin2_cos2(norm_sin2_cos2) + # normalize the values so they are on the unit circle + sin2, cos2 = normalize_sin_theta_cos_theta(sin2, cos2) + # extract 2x the angle + theta2 = np.arctan2(sin2, cos2) + # return the angle + return theta2 + + +def parse_rectangle_vertices(s2t_c2t_hw_cycx): + """ Convert a dimensions, angle, grasp center, based rectangle to vertices. + + s2t_c2t_hw_cycx: [sin(2*theta), cos(2*theta), height, width, center x, center y] + """ + # sin(2*theta), cos(2*theta) + theta = decode_sin2_cos2(s2t_c2t_hw_cycx[:2]) + rect_vertices = rectangle_vertices( + s2t_c2t_hw_cycx[2], # height + s2t_c2t_hw_cycx[3], # width + s2t_c2t_hw_cycx[4], # center y + s2t_c2t_hw_cycx[5], # center x + theta=theta) + return rect_vertices + + +def parse_rectangle_params(s2t_c2t_hw_cycx): + rect_vertices = parse_rectangle_vertices(s2t_c2t_hw_cycx) + rect_hlines = rectangle_homogeneous_lines(rect_vertices) + return rect_vertices, rect_hlines + + +def intersection_over_union(true_rp, pred_rp, true_rl, pred_rl): + """ Intersection over union of two oriented rectangles. + + Also known as the jaccard metric. 
+ + # Arguments + + true_rp: oriented rectanle 0 points + pred_rp: oriented rectangle 1 points + true_rl: oriented rectangle 0 homogeneous lines + pred_rl: oriented rectangle 1 homogeneous lines + """ + true_area = polygon_area_four_points(true_rp) + pred_area = polygon_area_four_points(pred_rp) + intersection_polygon = rectangle_intersection_polygon(true_rp, true_rl, pred_rp, pred_rl) + intersection_area = polygon_area(intersection_polygon) + + iou = intersection_area / (true_area + pred_area - intersection_area) + return iou + + +def shapely_intersection_over_union(rect0_points, rect1_points, verbose=0): + """ Find the intersection over union of two polygons using shapely + """ + # create and clean the polygons to eliminate any overlapping points + # https://toblerity.org/shapely/manual.html + p0 = Polygon(rect0_points).buffer(0) + p1 = Polygon(rect1_points).buffer(0) + if p0.is_valid and p1.is_valid: + intersection_area = p0.intersection(p1).area + + iou = intersection_area / (p0.area + p1.area - intersection_area) + if verbose > 0: + print('iou: ' + str(iou)) + return iou + else: + # TODO(ahundt) determine and fix the source of invalid polygons. + print('Warning: shapely_intersection_over_union() encountered an ' + 'invalid polygon. We will return an IOU of 0 so execution ' + 'might continue, but this bug should be addressed. ' + 'p0: ' + str(p0) + ' p1: ' + str(p1)) + return 0.0 + + +def normalize_sin_theta_cos_theta(sin_theta, cos_theta): + """ Put sin(theta) cos(theta) on the unit circle. + + Output values will be in (-1, 1). + normalize the prediction but keep the vector direction the same + """ + arr = sklearn.preprocessing.normalize(np.array([[sin_theta, cos_theta]], dtype=np.float)) + sin_theta = arr[0, 0] + cos_theta = arr[0, 1] + return sin_theta, cos_theta + + +def prediction_vector_has_grasp_success(y_pred): + has_grasp_success = (y_pred.size == 7) + return has_grasp_success + + +def get_prediction_vector_rectangle_start_index(y_pred): + """ Get the rectangle start index from an encoded prediction vector of length 6 or 7 + """ + has_grasp_success = prediction_vector_has_grasp_success(y_pred) + # the grasp rectangle start index + rect_index = 0 + if has_grasp_success: + rect_index = 1 + return rect_index + + +def decode_prediction_vector(y_true): + """ Decode a prediction vector into sin(2 * theta), cos(2 * theta), and 4 vertices + """ + rect_index = get_prediction_vector_rectangle_start_index(y_true) + end_angle_index = rect_index + 2 + y_true[rect_index: end_angle_index] = denorm_sin2_cos2(y_true[rect_index:end_angle_index]) + true_y_sin_theta, true_x_cos_theta = y_true[rect_index:end_angle_index] + true_rp = parse_rectangle_vertices(y_true[rect_index:]) + return true_y_sin_theta, true_x_cos_theta, true_rp + + +def decode_prediction_vector_theta_center_polygon(y_true): + """ Decode a prediction vector into theta and four rectangle vertices + + Only supports vector format that includes center information! 
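+
+    # Example
+
+    A rough sketch in the norm_sin2_cos2_hw_yx_6 format (all values made up):
+
+        y = np.array([0.75, 0.5, 20., 40., 112., 112.])
+        theta, center, rect = decode_prediction_vector_theta_center_polygon(y)
+        # theta is approximately pi / 4, center == [112., 112.] in y, x
+        # order, and rect holds the four rectangle vertices.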
+    """
+    rect_index = get_prediction_vector_rectangle_start_index(y_true)
+    end_angle_index = rect_index + 2
+    y_true[rect_index: end_angle_index] = denorm_sin2_cos2(y_true[rect_index:end_angle_index])
+    true_y_sin_theta, true_x_cos_theta = y_true[rect_index:end_angle_index]
+    true_rp = parse_rectangle_vertices(y_true[rect_index:])
+    true_y_sin_theta, true_x_cos_theta = normalize_sin_theta_cos_theta(true_y_sin_theta, true_x_cos_theta)
+    # right now it is 2 theta, so get theta
+    theta = np.arctan2(true_y_sin_theta, true_x_cos_theta) / 2.0
+    # the center should be the last two entries, in y, x order
+    center = y_true[-2:]
+    return theta, center, true_rp
+
+
+def angle_difference_less_than_threshold(
+        true_y_sin_theta, true_x_cos_theta,
+        pred_y_sin_theta, pred_x_cos_theta,
+        angle_threshold=np.radians(60.0),
+        verbose=0):
+    """ Returns true if the absolute angle difference is within the threshold, false otherwise.
+
+    Recall that angle differences wrap around a circle, so the shortest angular difference
+    may be in the +theta or the -theta direction, with wrapping around the boundaries.
+
+    Note that the angle threshold is set to 60 degrees because we are working with 2*theta.
+    TODO(ahundt) double check the implications of this.
+
+    # Arguments
+    angle_threshold: The maximum absolute angular difference permitted.
+    """
+    # normalize the ground truth but keep the vector direction the same
+    true_y_sin_theta, true_x_cos_theta = normalize_sin_theta_cos_theta(true_y_sin_theta, true_x_cos_theta)
+    true_angle = np.arctan2(true_y_sin_theta, true_x_cos_theta)
+    # normalize the prediction but keep the vector direction the same
+    pred_y_sin_theta, pred_x_cos_theta = normalize_sin_theta_cos_theta(pred_y_sin_theta, pred_x_cos_theta)
+    pred_angle = np.arctan2(pred_y_sin_theta, pred_x_cos_theta)
+    # print('pred angle: ' + str(pred_angle) + ' true angle: ' + str(true_angle))
+    true_pred_diff = true_angle - pred_angle
+    # wrap the difference back into (-pi, pi], since the shortest angular
+    # distance may cross the circular boundary
+    angle_difference = np.arctan2(np.sin(true_pred_diff), np.cos(true_pred_diff))
+    # print('angle_difference: ' + str(angle_difference) + ' deg: ' + str(np.degrees(angle_difference)))
+    is_within_angle_threshold = np.abs(angle_difference) <= angle_threshold
+    if verbose > 0:
+        print(' angle_difference_less_than_threshold(): ' +
+              ' angle_difference: ' + str(int(np.degrees(angle_difference))) +
+              ' threshold: ' + str(int(np.degrees(angle_threshold))) +
+              ' is_within_angle_threshold: ' + str(is_within_angle_threshold) +
+              ' true_angle: ' + str(np.degrees(true_angle)) +
+              ' pred_angle: ' + str(np.degrees(pred_angle)) +
+              ' units: degrees ')
+    return is_within_angle_threshold
+
+
+def jaccard_score(y_true, y_pred, angle_threshold=np.radians(60.0), iou_threshold=0.25, verbose=0):
+    """ Score a single grasp regression prediction against the ground truth.
+
+    Note that the angle threshold is set to 60 degrees because we are working with 2*theta.
+    TODO(ahundt) double check the implications of this.
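+    In other words, a 60 degree difference in the encoded 2*theta angle
+    corresponds to a 30 degree difference in the actual gripper orientation.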
+
+    # Arguments
+
+    Feature formats accepted:
+
+    grasp_success_norm_sin2_cos2_hw_yx_7:
+        [grasp_success, sin_2theta, cos_2theta, height, width, center_y, center_x]
+        [            0,          1,          2,      3,     4,        5,        6]
+
+    norm_sin2_cos2_hw_yx_6:
+        [sin_2theta, cos_2theta, height, width, center_y, center_x]
+        [         0,          1,      2,     3,        4,        5]
+
+    Not yet accepted:
+
+    norm_sin2_cos2_hw_4:
+        [sin_2theta, cos_2theta, height, width]
+        [         0,          1,      2,     3]
+
+    grasp_success_norm_sin2_cos2_hw_5:
+        [grasp_success, sin_2theta, cos_2theta, height, width]
+        [            0,          1,          2,      3,     4]
+
+    y_true: a numpy array of features
+    y_pred: a numpy array of features
+    angle_threshold: The maximum allowed difference in
+        angles for a grasp to be considered successful.
+        The default of 60 degrees is for 2 * theta, which is 30 degrees for theta.
+    iou_threshold: The minimum intersection over union score
+        for the grasp rectangles to be considered a match, 0.25 by default.
+
+    Note that because the encoding uses 2 * theta, angles that are off by
+    180 degrees are considered equal, which is the case for a gripper
+    with two parallel plates.
+    """
+
+    has_grasp_success = prediction_vector_has_grasp_success(y_pred)
+
+    # round grasp success to 0 or 1
+    # note this is not valid and not used if
+    # has_grasp_success is false.
+    predicted_success = np.rint(y_pred[0])
+    if has_grasp_success and predicted_success != int(y_true[0]):
+        # grasp success prediction doesn't match, return a 0 score
+        return 0.0
+    elif has_grasp_success and predicted_success == 0:
+        # The success prediction correctly matches the ground truth,
+        # plus both are False so this is a true negative.
+        # Any true negative where failure to grasp is predicted correctly
+        # gets credit regardless of box contents.
+        return 1.0
+    else:
+        # Either there is no grasp_success entry, or we correctly
+        # predicted a successful grasp.
+        # First check if the angles are close enough to match the angle_threshold.
+
+        # denormalize the values from the (0, 1) range back to (-1, 1) and get the array entries
+        true_y_sin_theta, true_x_cos_theta, true_rp = decode_prediction_vector(y_true)
+        pred_y_sin_theta, pred_x_cos_theta, pred_rp = decode_prediction_vector(y_pred)
+
+        # if the angle difference isn't close enough to the ground truth, return 0.0
+        if not angle_difference_less_than_threshold(
+                true_y_sin_theta, true_x_cos_theta,
+                pred_y_sin_theta, pred_x_cos_theta,
+                angle_threshold,
+                verbose=verbose):
+            return 0.0
+
+        # We passed all the other checks, so
+        # let's find out if the grasp boxes match
+        # via the jaccard distance.
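+        # shapely handles the polygon clipping here; its buffer(0) call
+        # cleans up the degenerate geometry that imperfect network
+        # predictions can produce.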
+        iou = shapely_intersection_over_union(true_rp, pred_rp)
+        if verbose:
+            print('iou: ' + str(iou))
+        if iou >= iou_threshold:
+            # passed the iou threshold
+            return 1.0
+        else:
+            # didn't meet the iou threshold
+            return 0.0
+
+
+def grasp_jaccard_batch(y_true, y_pred, verbose=0):
+    # print('y_true.shape: ' + str(y_true.shape))
+    # print('y_pred.shape: ' + str(y_pred.shape))
+    scores = []
+    for i in range(y_true.shape[0]):
+        # TODO(ahundt) comment the next few lines out when not debugging
+        # print a random sample roughly once every 10000 examples
+        verbose = 0
+        if np.random.randint(0, 10000) == 0:
+            verbose = 1
+            print('')
+            print('')
+            print('grasp_metrics.py sample of ground_truth and prediction:')
+        this_true = y_true[i, :]
+        this_pred = y_pred[i, :]
+        score = jaccard_score(this_true, this_pred, verbose=verbose)
+        if verbose:
+            print('s2t_c2t_hw_cycx_true: ' + str(this_true))
+            print('s2t_c2t_hw_cycx_pred: ' + str(this_pred))
+            print('score: ' + str(score))
+        scores += [score]
+    scores = np.array(scores, dtype=np.float32)
+    # print('scores.shape: ' + str(scores.shape))
+    return scores
+
+
+def grasp_jaccard(y_true, y_pred):
+    """ Calculates the jaccard metric score in a manner compatible with tf and keras metrics.
+
+    This is an IOU metric with angle difference and IOU score thresholds.
+
+    Feature formats are accepted as a 2d array containing a batch of data ordered as:
+
+        [grasp_success, sin_2theta, cos_2theta, height, width, center_y, center_x]
+        [            0,          1,          2,      3,     4,        5,        6]
+
+        [sin_2theta, cos_2theta, height, width, center_y, center_x]
+        [         0,          1,      2,     3,        4,        5]
+
+    It is very important to be aware that sin(2*theta) and cos(2*theta) are expected;
+    additionally, all coordinates and height/width are normalized by the network's input dimensions.
+    """
+    scores = tf.py_func(func=grasp_jaccard_batch, inp=[y_true, y_pred], Tout=tf.float32, stateful=False)
+    return scores
+
+
+def rotation_to_xyz_theta(rotation, verbose=0):
+    """Convert a rotation to an angle theta
+
+    From above, a rotation to the right should be a positive theta,
+    and a rotation to the left negative theta. The initial pose is with the
+    z axis pointing down, the y axis to the right and the x axis forward.
+
+    This format does not allow for arbitrary rotation commands to be defined,
+    and originates from the costar dataset.
+
+    In the google brain dataset the gripper is only commanded to
+    rotate around a single vertical axis, so the rotation is easy to
+    visualize; this axis also happens to approximately match the
+    direction defined by gravity.
+    Furthermore, the original paper had access to the geometry of the
+    arm joints, from which these parameters could easily be extracted,
+    which is not available here. To resolve this discrepancy,
+    here we assume that the gripper generally starts off at a
+    quaternion orientation of approximately [qx=-1, qy=0, qz=0, qw=0].
+    This is equivalent to the angle axis
+    representation of [a=np.pi, x=-1, y=0, z=0],
+    which I'll name default_rot.
+
+    It is also important to note the ambiguity of the
+    angular distance between any current pose
+    and the end pose. This angular distance will
+    always have a positive value, so the network
+    could not naturally discriminate between
+    turning left and turning right.
+    For this reason, we use the angular distance
+    from default_rot to define the input angle parameter,
+    and if the angle axis z component is < 0
+    we will use theta for the rotation,
+    but if the angle axis z component is >= 0
+    we will use -theta.
+    """
+    # note: pyquaternion's Quaternion stores its elements in (w, x, y, z) order
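+    # For example (a hedged sketch assuming pyquaternion semantics): a rotation
+    # of pi/2 about +z gives aa.angle == pi/2 and aa.axis of roughly [0, 0, 1],
+    # so the sign flip below returns theta == -pi/2 for that pose.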
+    aa = Quaternion(rotation)
+    # angle in radians
+    theta = aa.angle
+    if aa.axis[2] < 0:
+        multiply = 1.0
+    else:
+        multiply = -1.0
+    if verbose > 0:
+        print("ANGLE_AXIS_MULTIPLY: ", aa.angle, np.array(aa.axis), multiply)
+    theta *= multiply
+
+    return np.concatenate([aa.axis, [theta]], axis=-1)
+
+
+def normalize_axis(aaxyz, epsilon=1e-5, verbose=0):
+    """ Normalize an axis in angle axis format data.
+
+    If the axis is all zeros, epsilon is added to the final axis entry.
+    """
+    if not np.any(aaxyz):
+        # source: https://stackoverflow.com/a/23567941/99379
+        # we checked if all values are zero, fix the missing axis
+        aaxyz[-1] += epsilon
+    arr = sklearn.preprocessing.normalize(np.array([aaxyz], dtype=np.float64))
+    aaxyz = np.squeeze(arr[0, :])
+    if verbose:
+        print('normalize_axis: ' + str(aaxyz))
+    return aaxyz
+
+
+def encode_xyz_qxyzw_to_xyz_aaxyz_nsc(xyz_qxyzw, rescale_meters=4, rotation_weight=1, random_augmentation=None):
+    """ Encode a translation + quaternion pose to an encoded xyz, axis, and an angle as sin(theta) cos(theta)
+
+    rescale_meters: Divide the number of meters by this number so
+        positions will be encoded between 0 and 1.
+        For example if you want to be able to reach forward and back by 2 meters, divide by 4.
+    rotation_weight: scale down rotation values by this factor to a smaller range
+        so mse gives similar weight to both rotations and translations.
+        Use 1.0 for no adjustment (the default).
+        A value of 0.001 makes 1 radian carry about equal weight to 1 millimeter.
+    random_augmentation: default None means no data modification,
+        otherwise a value between 0.0 and 1.0 for the probability
+        of randomly modifying the data with a small translation and rotation.
+        Enabling random_augmentation is not recommended.
+    """
+    xyz = (xyz_qxyzw[:3] / rescale_meters) + 0.5
+    length = len(xyz_qxyzw)
+    if length == 7:
+        # print('xyz: ' + str(xyz))
+        rotation = Quaternion(xyz_qxyzw[3:])
+        # pose augmentation with no feedback or correspondingly adjusted transform poses
+        # apply the augmentation with the documented probability
+        if random_augmentation is not None and np.random.random() < random_augmentation:
+            # random rotation change
+            # random = Quaternion.random()
+            # # only take rotations less than 5 degrees
+            # while random.angle > np.pi / 36.:
+            #     # TODO(ahundt) make more efficient and re-enable
+            #     random = Quaternion.random()
+            # rotation = rotation * random
+            # small random translation jitter, up to +/-0.05 in the normalized coordinate space
+            random = (np.random.random(3) - 0.5) / 10.
+            xyz = xyz + random
+
+        aaxyz_theta = rotation_to_xyz_theta(rotation)
+        # encode the unit axis vector into the [0,1] range
+        # rotation_weight makes it so mse applied to rotation values
+        # is on a similar scale to the translation values.
+        aaxyz = ((aaxyz_theta[:-1] / 2) * rotation_weight) + 0.5
+        nsc = encode_theta(aaxyz_theta[-1])
+        # print('nsc: ' + str(nsc))
+        xyz_aaxyz_nsc = np.concatenate([xyz, aaxyz, nsc], axis=-1)
+        return xyz_aaxyz_nsc
+    elif length == 3:
+        if random_augmentation is not None and np.random.random() < random_augmentation:
+            # small random translation jitter, up to +/-0.05 in the normalized coordinate space
+            random = (np.random.random(3) - 0.5) / 10.
+            xyz = xyz + random
+
+        return xyz
+    else:
+        raise ValueError('encode_xyz_qxyzw_to_xyz_aaxyz_nsc: unsupported input data length of ' + str(length))
+
+
+def batch_encode_xyz_qxyzw_to_xyz_aaxyz_nsc(batch_xyz_qxyzw, rescale_meters=4, rotation_weight=1, random_augmentation=None):
+    """ Expects an n by 7 batch with xyz_qxyzw rows.
+
+    rescale_meters: Divide the number of meters by this number so
+        positions will be encoded between 0 and 1.
For example if you want to be able to reach forward and back by 2 meters, divide by 4.
+    rotation_weight: scale down rotation values by this factor to a smaller range
+        so mse gives similar weight to both rotations and translations.
+        Use 1.0 for no adjustment.
+    random_augmentation: default None means no data modification,
+        otherwise a value between 0.0 and 1.0 for the probability
+        of randomly modifying the data with a small translation and rotation.
+        Enabling random_augmentation is not recommended.
+    """
+    encoded_poses = []
+    for xyz_qxyzw in batch_xyz_qxyzw:
+        # print('xyz_qxyzw: ' + str(xyz_qxyzw))
+        xyz_aaxyz_nsc = encode_xyz_qxyzw_to_xyz_aaxyz_nsc(
+            xyz_qxyzw, rescale_meters=rescale_meters, rotation_weight=rotation_weight, random_augmentation=random_augmentation)
+        # print('xyz_aaxyz_nsc: ' + str(xyz_aaxyz_nsc))
+        encoded_poses.append(xyz_aaxyz_nsc)
+    return np.stack(encoded_poses, axis=0)
+
+
+def decode_xyz_aaxyz_nsc_to_xyz_qxyzw(xyz_aaxyz_nsc, rescale_meters=4, rotation_weight=1):
+    """ Decode an encoded xyz, axis, and sin(theta) cos(theta) angle back into a translation + quaternion pose.
+
+    This is the inverse of `encode_xyz_qxyzw_to_xyz_aaxyz_nsc()`.
+
+    rescale_meters: Multiply the encoded positions back up by this number,
+        it must match the value used when encoding.
+    rotation_weight: scale down rotation values by this factor to a smaller range
+        so mse gives similar weight to both rotations and translations.
+        Use 1.0 for no adjustment.
+    """
+    xyz = (xyz_aaxyz_nsc[:3] - 0.5) * rescale_meters
+    length = len(xyz_aaxyz_nsc)
+    if length == 8:
+        theta = decode_sin_cos(xyz_aaxyz_nsc[-2:])
+        # decode the ([0, 1] * rotation_weight) range back to the [-1, 1] range
+        aaxyz = ((xyz_aaxyz_nsc[3:-2] - 0.5) * 2) / rotation_weight
+        # aaxyz is the axis component of the angle axis format,
+        # note that rotation_weight is automatically removed by the normalization step.
+        aaxyz = normalize_axis(aaxyz)
+        q = Quaternion(axis=aaxyz, angle=theta)
+        xyz_qxyzw = np.concatenate([xyz, q.elements], axis=-1)
+        return xyz_qxyzw
+    elif length != 3:
+        raise ValueError('decode_xyz_aaxyz_nsc_to_xyz_qxyzw: unsupported input data length of ' + str(length))
+    return xyz
+
+
+def grasp_acc(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.01, max_rotation=0.261799):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+    Limits default to 15 degrees and 1cm.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.01 meters, or 1cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 15 degrees (0.261799 radians),
+        rotations must be less than this angular distance away.
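+
+    The fixed-threshold variants below, grasp_acc_5mm_7_5deg through
+    grasp_acc_512cm_360deg, wrap the same batch implementation with
+    progressively looser limits, presumably because keras metric functions
+    are called with only (y_true, y_pred) and cannot take extra arguments.
+
+    A minimal usage sketch (assumes a keras model whose labels use the
+    xyz_aaxyz_nsc encoding; the names are illustrative):
+
+        model.compile(optimizer='adam', loss='mse',
+                      metrics=[grasp_acc, grasp_acc_2cm_30deg])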
+ """ + # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc + [filter_result] = tf.py_func( + grasp_accuracy_xyz_aaxyz_nsc_batch, + [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation], + [tf.float32], stateful=False, + name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch') + filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0]) + return filter_result + + +def grasp_acc_5mm_7_5deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.005, max_rotation=0.1308995): + """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding. + + Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise. + Limits default to 7.5 degrees and 0.5cm. + + Supported formats are translation xyz with length 3, + aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5, + or xyz_aaxyz_nsc which incorporates both of the above with length 8. + + max_translation: defaults to 0.005 meters, which is 0.5cm, + translations must be less than this distance away. + max_rotation: defaults to 7.5 degrees, which is 0.1308995 radians, + rotations must be less than this angular distance away. + """ + # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc + [filter_result] = tf.py_func( + grasp_accuracy_xyz_aaxyz_nsc_batch, + [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation], + [tf.float32], stateful=False, + name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch') + filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0]) + return filter_result + + +def grasp_acc_1cm_15deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.01, max_rotation=0.261799): + """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding. + + Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise. + Limits default to 15 degrees and 1cm. + + Supported formats are translation xyz with length 3, + aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5, + or xyz_aaxyz_nsc which incorporates both of the above with length 8. + + max_translation: defaults to 0.01 meters, which is 1cm, + translations must be less than this distance away. + max_rotation: defaults to 15 degrees in radians, + rotations must be less than this angular distance away. + """ + # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc + [filter_result] = tf.py_func( + grasp_accuracy_xyz_aaxyz_nsc_batch, + [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation], + [tf.float32], stateful=False, + name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch') + filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0]) + return filter_result + + +def grasp_acc_2cm_30deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.02, max_rotation=0.523598): + """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding. + + Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise. + + Supported formats are translation xyz with length 3, + aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5, + or xyz_aaxyz_nsc which incorporates both of the above with length 8. + + max_translation: defaults to 0.02 meters, which is 2cm, + translations must be less than this distance away. 
+    max_rotation: defaults to 30 degrees, which is 0.523598 radians,
+        rotations must be less than this angular distance away.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_4cm_60deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.04, max_rotation=1.047196):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.04 meters, which is 4cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 60 degrees, which is 1.047196 radians,
+        rotations must be less than this angular distance away.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_8cm_120deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.08, max_rotation=2.094392):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.08 meters, which is 8cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 120 degrees, which is 2.094392 radians,
+        rotations must be less than this angular distance away.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_16cm_240deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.16, max_rotation=4.188784):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.16 meters, which is 16cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 240 degrees, which is 4.188784 radians,
+        rotations must be less than this angular distance away.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_32cm_360deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.32, max_rotation=6.2832):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.32 meters, which is 32cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 360 degrees, which is 6.2832 radians,
+        so any rotation is accepted.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_64cm_360deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.64, max_rotation=6.2832):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.64 meters, which is 64cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 360 degrees, which is 6.2832 radians,
+        so any rotation is accepted.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_128cm_360deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=1.28, max_rotation=6.2832):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 1.28 meters, which is 128cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 360 degrees, which is 6.2832 radians,
+        so any rotation is accepted.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_256cm_360deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=2.56, max_rotation=6.2832):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 2.56 meters, which is 256cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 360 degrees, which is 6.2832 radians,
+        so any rotation is accepted.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def grasp_acc_512cm_360deg(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=5.12, max_rotation=6.2832):
+    """ Calculate 3D grasp accuracy for a single result with grasp_accuracy_xyz_aaxyz_nsc encoding.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 5.12 meters, which is 512cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 360 degrees, which is 6.2832 radians,
+        so any rotation is accepted.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        grasp_accuracy_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation, max_rotation],
+        [tf.float32], stateful=False,
+        name='py_func/grasp_accuracy_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def cart_error(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+    """ Calculate the absolute Cartesian (translation) distance between encoded poses; tf/keras metric wrapper.
+
+    Input format is xyz_aaxyz_nsc; only the translation component is compared.
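+
+    For example (illustrative values): if the decoded translations are
+    [0.10, 0.20, 0.30] and [0.10, 0.24, 0.33] meters, the reported distance
+    is sqrt(0.04**2 + 0.03**2) = 0.05 meters.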
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        absolute_cart_distance_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc],
+        [tf.float32], stateful=False,
+        name='py_func/absolute_cart_distance_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def angle_error(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+    """ Calculate the absolute angular distance in radians between encoded poses; tf/keras metric wrapper.
+
+    Input format is xyz_aaxyz_nsc.
+    """
+    # TODO(ahundt) make a single, simple call for grasp_accuracy_xyz_aaxyz_nsc, no py_func etc
+    [filter_result] = tf.py_func(
+        absolute_angle_distance_xyz_aaxyz_nsc_batch,
+        [y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc],
+        [tf.float32], stateful=False,
+        name='py_func/absolute_angle_distance_xyz_aaxyz_nsc_batch')
+    filter_result.set_shape(y_true_xyz_aaxyz_nsc.get_shape()[0])
+    return filter_result
+
+
+def absolute_angle_distance_xyz_aaxyz_nsc_single(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+    """ Calculate the absolute angular distance in radians between two encoded poses.
+
+    Input format is xyz_aaxyz_nsc.
+    This version is for a single pair of numpy arrays of length 8,
+    or length 5 for the rotation-only aaxyz_nsc format.
+    """
+    length = len(y_true_xyz_aaxyz_nsc)
+    if length == 5:
+        # workaround for rotation distance only,
+        # just use [0.5, 0.5, 0.5] for the translation component
+        # so the existing code can be utilized
+        fake_translation = np.array([0.5, 0.5, 0.5])
+        y_true_xyz_aaxyz_nsc = np.concatenate([fake_translation, y_true_xyz_aaxyz_nsc])
+        y_pred_xyz_aaxyz_nsc = np.concatenate([fake_translation, y_pred_xyz_aaxyz_nsc])
+
+    y_true_xyz_qxyzw = decode_xyz_aaxyz_nsc_to_xyz_qxyzw(y_true_xyz_aaxyz_nsc)
+    y_pred_xyz_qxyzw = decode_xyz_aaxyz_nsc_to_xyz_qxyzw(y_pred_xyz_aaxyz_nsc)
+    y_true_q = Quaternion(y_true_xyz_qxyzw[3:])
+    y_pred_q = Quaternion(y_pred_xyz_qxyzw[3:])
+    return Quaternion.absolute_distance(y_true_q, y_pred_q)
+
+
+def absolute_angle_distance_xyz_aaxyz_nsc_batch(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+    """ Calculate the absolute angular distance for a batch of encoded poses.
+
+    Expects a batch of data as an nx8 array. Eager execution / numpy version.
+    Input format is xyz_aaxyz_nsc.
+    """
+    # print('type of y_true_xyz_aaxyz_nsc: ' + str(type(y_true_xyz_aaxyz_nsc)))
+    distances = []
+    for y_true, y_pred in zip(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+        one_distance = absolute_angle_distance_xyz_aaxyz_nsc_single(y_true, y_pred)
+        # print('one angle distance: ' + str(one_distance))
+        distances.append(one_distance)
+    distances = np.array(distances, np.float32)
+    return distances
+
+
+def absolute_cart_distance_xyz_aaxyz_nsc_single(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+    """ Calculate the Cartesian (translation) distance between two encoded poses.
+
+    This version is for a single pair of numpy arrays of length 8.
+    Input format is xyz_aaxyz_nsc.
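+
+    Only the translation component is compared; the rotation portion of
+    the encoding is decoded but not used by this function.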
+    """
+    y_true_xyz_qxyzw = decode_xyz_aaxyz_nsc_to_xyz_qxyzw(y_true_xyz_aaxyz_nsc)
+    y_pred_xyz_qxyzw = decode_xyz_aaxyz_nsc_to_xyz_qxyzw(y_pred_xyz_aaxyz_nsc)
+    # translation distance
+    return np.linalg.norm(y_true_xyz_qxyzw[:3] - y_pred_xyz_qxyzw[:3])
+
+
+def absolute_cart_distance_xyz_aaxyz_nsc_batch(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+    """ Calculate the Cartesian (translation) distance for a batch of encoded poses.
+
+    Expects a batch of data as an nx8 array. Eager execution / numpy version.
+    """
+    # print('type of y_true_xyz_aaxyz_nsc: ' + str(type(y_true_xyz_aaxyz_nsc)))
+    distances = []
+    for y_true, y_pred in zip(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+        one_distance = absolute_cart_distance_xyz_aaxyz_nsc_single(y_true, y_pred)
+        # print('one cart distance: ' + str(one_distance))
+        distances.append(one_distance)
+    distances = np.array(distances, np.float32)
+    return distances
+
+
+def grasp_accuracy_xyz_aaxyz_nsc_single(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.01, max_rotation=0.261799):
+    """ Calculate 3D grasp accuracy for a single 1D numpy array for the ground truth and predicted value.
+
+    Return 1 if the prediction meets both the translation and rotation accuracy criteria, 0 otherwise.
+
+    Supported formats are translation xyz with length 3,
+    aaxyz_nsc which is an axis and normalized sin(theta) cos(theta) with length 5,
+    or xyz_aaxyz_nsc which incorporates both of the above with length 8.
+
+    max_translation: defaults to 0.01 meters, or 1cm,
+        translations must be less than this distance away.
+    max_rotation: defaults to 15 degrees in radians,
+        rotations must be less than this angular distance away.
+    """
+    length = len(y_true_xyz_aaxyz_nsc)
+    if length == 3 or length == 8:
+        # translation distance
+        translation = absolute_cart_distance_xyz_aaxyz_nsc_single(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc)
+        if length == 3:
+            # translation component only
+            if translation < max_translation:
+                return 1.
+        # translation and rotation
+        elif length == 8:
+            # rotation distance
+            angle_distance = absolute_angle_distance_xyz_aaxyz_nsc_single(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc)
+            if angle_distance < max_rotation and translation < max_translation:
+                return 1.
+    elif length == 5:
+        # rotation distance only, just use [0.5, 0.5, 0.5] for the translation component so the existing code can be utilized
+        fake_translation = np.array([0.5, 0.5, 0.5])
+        angle_distance = absolute_angle_distance_xyz_aaxyz_nsc_single(
+            np.concatenate([fake_translation, y_true_xyz_aaxyz_nsc]),
+            np.concatenate([fake_translation, y_pred_xyz_aaxyz_nsc]))
+        if angle_distance < max_rotation:
+            return 1.
+    else:
+        raise ValueError('grasp_accuracy_xyz_aaxyz_nsc_single: unsupported label value format of length ' + str(length))
+    return 0.
+
+
+def grasp_accuracy_xyz_aaxyz_nsc_batch(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc, max_translation=0.01, max_rotation=0.261799):
+    """ Calculate 3D grasp accuracy for a batch of results.
+
+    Expects a batch of data as an nx8 array. Eager execution / numpy version.
+
+    max_translation defaults to 0.01 meters, or 1cm.
+    max_rotation defaults to 15 degrees in radians.
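+
+    A minimal sketch (illustrative; identical encodings always pass):
+
+        y_true = np.full((4, 8), 0.5)
+        y_pred = y_true.copy()
+        grasp_accuracy_xyz_aaxyz_nsc_batch(y_true, y_pred)  # -> array of four 1.0 values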
+    """
+    # print('type of y_true_xyz_aaxyz_nsc: ' + str(type(y_true_xyz_aaxyz_nsc)))
+    accuracies = []
+    for y_true, y_pred in zip(y_true_xyz_aaxyz_nsc, y_pred_xyz_aaxyz_nsc):
+        one_accuracy = grasp_accuracy_xyz_aaxyz_nsc_single(
+            y_true, y_pred, max_translation=max_translation, max_rotation=max_rotation)
+        # print('one grasp acc: ' + str(one_accuracy))
+        accuracies.append(one_accuracy)
+    accuracies = np.array(accuracies, np.float32)
+    return accuracies
diff --git a/enas/cifar10/grasp_utilities.py b/enas/cifar10/grasp_utilities.py
new file mode 100644
index 0000000..2061625
--- /dev/null
+++ b/enas/cifar10/grasp_utilities.py
@@ -0,0 +1,245 @@
+import sys
+import re
+import numpy as np
+import os
+import json
+import datetime
+import errno
+import six
+
+
+class NumpyEncoder(json.JSONEncoder):
+    """ json encoder for numpy types
+
+    source: https://stackoverflow.com/a/49677241/99379
+    """
+    def default(self, obj):
+        if isinstance(obj,
+                      (np.int_, np.intc, np.intp, np.int8,
+                       np.int16, np.int32, np.int64, np.uint8,
+                       np.uint16, np.uint32, np.uint64)):
+            return int(obj)
+        elif isinstance(obj,
+                        (np.float_, np.float16, np.float32,
+                         np.float64)):
+            return float(obj)
+        elif isinstance(obj, (np.ndarray,)):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+
+
+def rotate(data, shift=1):
+    """ Rotate the entries of a list or numpy array forward by `shift` positions.
+
+    For example, [0, 1, 2] will become [1, 2, 0] and
+    [4, 3, 1, 0] will become [3, 1, 0, 4].
+    The contents of index 1 move to index 0,
+    and the final entry will contain the original contents of index 0.
+    Always operates on axis 0.
+    """
+    if isinstance(data, list):
+        return data[shift:] + data[:shift]
+    else:
+        # np.roll shifts entries toward higher indices, so the shift is
+        # negated here to match the list behavior documented above
+        return np.roll(data, -shift, axis=0)
+
+
+def mkdir_p(path):
+    """Create the specified path on the filesystem like the `mkdir -p` command
+
+    Creates one or more filesystem directory levels as needed,
+    and does not return an error if the directory already exists.
+    """
+    # http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
+    try:
+        os.makedirs(path)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+
+
+def timeStamped(fname, fmt='%Y-%m-%d-%H-%M-%S_{fname}'):
+    """ Apply a timestamp to the front of a filename description.
+
+    see: http://stackoverflow.com/a/5215012/99379
+    """
+    return datetime.datetime.now().strftime(fmt).format(fname=fname)
+
+
+def load_hyperparams_json(hyperparams_file, fine_tuning=False, learning_rate=None, feature_combo_name=None):
+    """ Load hyperparameters from a json file
+
+    # Returns
+
+    A kwargs dictionary of hyperparameters.
+    """
+    kwargs = {}
+    hyperparams = None
+    if hyperparams_file:
+        with open(hyperparams_file, mode='r') as hyperparams_fp:
+            kwargs = json.load(hyperparams_fp)
+        hyperparams = kwargs
+    if fine_tuning:
+        kwargs['trainable'] = True
+        kwargs['learning_rate'] = learning_rate
+        # TODO(ahundt) should we actually write the fine tuning settings out to the hyperparams log?
+        # hyperparams = kwargs
+
+    if (kwargs is not None and feature_combo_name is not None and
+            'feature_combo_name' in kwargs and
+            kwargs['feature_combo_name'] != feature_combo_name):
+        print('Warning: overriding old hyperparam feature_combo_name: %s'
+              ' with new feature_combo_name: %s. 
This means the network '
+              'structure and inputs will be different from what is defined '
+              'in the hyperparams file: %s' %
+              (kwargs['feature_combo_name'], feature_combo_name, hyperparams_file))
+        kwargs.pop('feature_combo_name')
+        # note: hyperparams refers to the same dict as kwargs here,
+        # so popping from kwargs already removed the key from both
+    return kwargs
+
+
+def is_sequence(arg):
+    """Returns true if arg is a list or another Python Sequence, and false otherwise.
+
+    Strings are explicitly excluded by the hasattr(arg, "strip") check.
+
+    source: https://stackoverflow.com/a/17148334/99379
+    """
+    return (not hasattr(arg, "strip") and
+            (hasattr(arg, "__getitem__") or
+             hasattr(arg, "__iter__")))
+
+
+def find_best_weights(fold_log_dir, match_string='', verbose=0, out_file=sys.stdout):
+    """ Find the weights file with the best val_* score in a directory.
+
+    Note that the default match_string of '' matches every .h5 file in the directory.
+    """
+    # Now we have to load the best model.
+    # '200_epoch_real_run' is for backwards compatibility before
+    # the fold nums were put into each fold's log_dir and run_name.
+    directory_listing = os.listdir(fold_log_dir)
+    fold_checkpoint_files = []
+    for name in directory_listing:
+        name = os.path.join(fold_log_dir, name)
+        if not os.path.isdir(name) and '.h5' in name:
+            if '200_epoch_real_run' in name or match_string in name:
+                fold_checkpoint_files += [name]
+
+    # check the filenames for the highest val score
+    fold_checkpoint_file = None
+    best_val = 0.0
+    for filename in fold_checkpoint_files:
+        if 'val_' in filename:
+            # pull out all the floating point numbers
+            # source: https://stackoverflow.com/a/4703409/99379
+            nums = re.findall(r"[-+]?\d*\.\d+|\d+", filename)
+            if len(nums) > 0:
+                # the '5' in the '.h5' extension is also matched,
+                # so the score is the second-to-last number found
+                cur_num = np.abs(float(nums[-2]))
+                if verbose > 0:
+                    out_file.write('old best ' + str(best_val) + ' current ' + str(cur_num) + '\n')
+                if cur_num > best_val:
+                    if verbose > 0:
+                        out_file.write('new best: ' + str(cur_num) + ' file: ' + filename + '\n')
+                    best_val = cur_num
+                    fold_checkpoint_file = filename
+
+    if fold_checkpoint_file is None:
+        raise ValueError('\n\nSomething went wrong when looking for model checkpoints, '
+                         'you need to take a look at model_predict_k_fold() '
+                         'in cornell_grasp_train.py. 
Here are the '
+                         'model checkpoint files we were looking at: \n\n' +
+                         str(fold_checkpoint_files))
+    return fold_checkpoint_file
+
+
+def make_model_description(run_name, model_name, hyperparams, dataset_names_str, label_features=None):
+    """ Put several strings together for a model description used in file and folder names
+    """
+    model_description = ''
+    if run_name:
+        model_description += run_name + '-'
+    if model_name:
+        model_description += model_name + '-'
+
+    # if hyperparams is not None:
+    #     if 'image_model_name' in hyperparams:
+    #         model_description += '_img_' + hyperparams['image_model_name']
+    #     if 'vector_model_name' in hyperparams:
+    #         model_description += '_vec_' + hyperparams['vector_model_name']
+    #     if 'trunk_model_name' in hyperparams:
+    #         model_description += '_trunk_' + hyperparams['trunk_model_name']
+    ########################################################
+    # End tensor configuration, begin model configuration and training
+    model_description += '-dataset_' + dataset_names_str
+
+    if label_features is not None:
+        model_description += '-' + label_features
+
+    run_name = timeStamped(model_description)
+    return run_name
+
+
+def multi_run_histories_summary(
+        run_histories,
+        save_filename=None,
+        metrics='val_binary_accuracy',
+        description_prefix='k_fold_average_',
+        results_prefix='k_fold_results',
+        multi_history_metrics='mean',
+        verbose=1):
+    """ Find the k_fold average of the best model weights on each fold, and save the results.
+
+    This can be used to summarize multiple runs, be they on different models or the same model.
+
+    Please note that currently this should only be utilized with classification models,
+    or regression models with absolute thresholds. It will not calculate
+    grasp_jaccard regression models' scores correctly.
+
+    # Arguments
+
+    run_histories: A dictionary from training run description strings to keras history objects.
+    multi_history_metrics: 'mean', 'min', or 'max',
+        used to summarize the data from multiple training runs.
+
+    # Returns
+
+    A results dictionary including the best value of each metric for each run,
+    plus the summary value across all runs.
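+
+    A minimal usage sketch (history objects are the return values of
+    keras Model.fit(); the names are illustrative):
+
+        histories = {'fold_0': h0, 'fold_1': h1}
+        results = multi_run_histories_summary(
+            histories, save_filename='k_fold_results.json',
+            metrics='val_binary_accuracy')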
+    """
+    if isinstance(metrics, str):
+        metrics = [metrics]
+    if isinstance(multi_history_metrics, str):
+        multi_history_metrics = [multi_history_metrics]
+    results = {}
+    for metric, multi_history_metric in zip(metrics, multi_history_metrics):
+        best_metric_scores = []
+        for history_description, history_object in six.iteritems(run_histories):
+            if 'loss' in metric or 'error' in metric:
+                best_score = np.min(history_object.history[metric])
+                results[history_description + '_min_' + metric] = best_score
+            else:
+                best_score = np.max(history_object.history[metric])
+                results[history_description + '_max_' + metric] = best_score
+            best_metric_scores += [best_score]
+        if multi_history_metric == 'mean' or multi_history_metric == 'average':
+            k_fold_average = np.mean(best_metric_scores)
+        elif multi_history_metric == 'min':
+            k_fold_average = np.min(best_metric_scores)
+        elif multi_history_metric == 'max':
+            k_fold_average = np.max(best_metric_scores)
+        else:
+            raise ValueError(
+                'multi_run_histories_summary(): Unsupported multi_history_metric: ' +
+                str(multi_history_metric))
+        result_key = description_prefix + '_' + multi_history_metric + '_' + metric
+        results[result_key] = k_fold_average
+
+    if verbose:
+        print(str(results_prefix) + ':\n ' + str(results))
+
+    if save_filename is not None:
+        with open(save_filename, 'w') as fp:
+            # save out all kfold params so they can be reloaded in the future
+            json.dump(results, fp)
+    return results
diff --git a/enas/cifar10/image_ops.py b/enas/cifar10/image_ops.py
index ea10547..afe0613 100644
--- a/enas/cifar10/image_ops.py
+++ b/enas/cifar10/image_ops.py
@@ -1,6 +1,7 @@
 import numpy as np
 import tensorflow as tf
 from tensorflow.python.training import moving_averages
+import traceback
 
 from enas.common_ops import create_weight
 from enas.common_ops import create_bias
@@ -89,6 +90,16 @@ def global_avg_pool(x, data_format="NHWC"):
   return x
 
 
+def global_max_pool(x, data_format="NHWC"):
+  if data_format == "NHWC":
+    x = tf.reduce_max(x, [1, 2])
+  elif data_format == "NCHW":
+    x = tf.reduce_max(x, [2, 3])
+  else:
+    raise NotImplementedError("Unknown data_format {}".format(data_format))
+  return x
+
+
 def batch_norm(x, is_training, name="bn", decay=0.9, epsilon=1e-5,
                data_format="NHWC"):
   if data_format == "NHWC":
@@ -130,6 +141,85 @@ def batch_norm(x, is_training, name="bn", decay=0.9, epsilon=1e-5,
   return x
 
 
+def norm(x, is_training, name=None, decay=0.9, epsilon=1e-5, data_format="NHWC", norm_type='group', G=32, verbose=0):
+  """ Perform batch normalization or group normalization, depending on the norm_type argument.
+
+  norm_type: options include none, batch, and group.
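+  G: the number of groups when norm_type is 'group'; at run time G is
+      clamped with tf.minimum(G, C) so it never exceeds the channel count.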
+  reference: https://github.com/shaohua0116/Group-Normalization-Tensorflow
+  """
+  shape_list = x.get_shape().as_list()
+  if verbose > 0:
+    print('-' * 80)
+    print('group_norm input x shape outside scope: ' + str(shape_list) + ' data_format: ' + str(data_format))
+    for line in traceback.format_stack():
+      print(line.strip())
+  if data_format == "NHWC":
+    c_shape = [x.get_shape()[3]]
+  elif data_format == "NCHW":
+    c_shape = [x.get_shape()[1]]
+  else:
+    raise NotImplementedError("Unknown data_format {}".format(data_format))
+  if name is None:
+    name = norm_type + '_norm'
+  with tf.variable_scope(name, reuse=None if is_training else True):
+
+    if norm_type == 'none':
+      output = x
+    elif norm_type == 'batch':
+      output = batch_norm(
+          x=x, is_training=is_training, name=name,
+          decay=decay, epsilon=epsilon, data_format=data_format)
+    elif norm_type == 'group':
+      # normalize
+      # transpose: [bs, h, w, c] to [bs, c, h, w] following the paper
+      # print('group_norm input x shape inside scope: ' + str(x.get_shape().as_list()))
+      if data_format == "NHWC":
+        x = tf.transpose(x, [0, 3, 1, 2])
+        # c_shape = [x.get_shape()[3]]
+        # channels_axis=-1, reduction_axes=[-3, -2]
+      elif data_format == "NCHW":
+        pass
+        # already in the right format
+        # c_shape = [x.get_shape()[1]]
+        # channels_axis=-3, reduction_axes=[-2, -1]
+      else:
+        raise NotImplementedError("Unknown data_format {}".format(data_format))
+      shape = tf.shape(x)
+      N = shape[0]
+      C = shape[1]
+      H = shape[2]
+      W = shape[3]
+      G = tf.minimum(G, C)
+      x = tf.reshape(x, [N, G, C // G, H, W])
+      mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True)
+      x = (x - mean) / tf.sqrt(var + epsilon)
+      # per channel gamma and beta
+      gamma = tf.get_variable('gamma', c_shape,
+                              initializer=tf.constant_initializer(1.0, dtype=tf.float32))
+      beta = tf.get_variable('beta', c_shape,
+                             initializer=tf.constant_initializer(0.0, dtype=tf.float32))
+      gamma = tf.reshape(gamma, [1, C, 1, 1])
+      beta = tf.reshape(beta, [1, C, 1, 1])
+
+      output = tf.reshape(x, [N, C, H, W]) * gamma + beta
+
+      if data_format == "NHWC":
+        # transpose back: [bs, c, h, w] to [bs, h, w, c] following the paper
+        output = tf.transpose(output, [0, 2, 3, 1])
+      elif data_format == "NCHW":
+        # already in the right format
+        pass
+      else:
+        raise NotImplementedError("Unknown data_format {}".format(data_format))
+      # recover initial shape information
+      if shape_list[0] is None:
+        # first index is batch, that should be inferred
+        shape_list[0] = -1
+      output = tf.reshape(output, shape_list)
+    else:
+      raise NotImplementedError
+  return output
+
+
 def batch_norm_with_mask(x, is_training, mask, num_channels, name="bn",
                          decay=0.9, epsilon=1e-3, data_format="NHWC"):
diff --git a/enas/cifar10/main.py b/enas/cifar10/main.py
index 644e5c7..26be2b3 100644
--- a/enas/cifar10/main.py
+++ b/enas/cifar10/main.py
@@ -5,9 +5,9 @@ import os
 
 try:
-  import cPickle as pickle
+    import cPickle as pickle
 except ImportError:
-  import _pickle as pickle
+    import _pickle as pickle
 
 import shutil
 import sys
@@ -33,22 +33,36 @@
 from enas.cifar10.micro_controller import MicroController
 from enas.cifar10.micro_child import MicroChild
 
+
 flags = tf.app.flags
 FLAGS = flags.FLAGS
 
 DEFINE_boolean("reset_output_dir", False, "Delete output_dir if exists.")
 DEFINE_string("data_path", "", "")
+DEFINE_string("data_base_path", "~/.keras/datasets/costar_block_stacking_dataset_v0.3/", "Base directory of the costar block stacking dataset.")
 DEFINE_string("output_dir", "", "")
 DEFINE_string("data_format", "NHWC", "'NHWC' or 'NCHW'")
-DEFINE_string("dataset", "cifar", "'cifar' or 'fmnist'")
+DEFINE_string("dataset", 
"cifar", "'cifar' or 'fmnist' or 'stacking'") DEFINE_string("search_for", None, "Must be [macro|micro]") DEFINE_integer("batch_size", 32, "") +DEFINE_integer("valid_set_size", 128, "") +DEFINE_integer("height_img", 32, "") +DEFINE_integer("width_img", 32, "") +DEFINE_boolean("regression", False, "Task is regression or classification") +DEFINE_boolean("translation_only", False, "Translation only case") +DEFINE_boolean("use_root", False, "Process image and vector and then tile") +DEFINE_boolean("one_hot_encoding", False, "Use one hot encoding for labels (only for stacking dataset)") +DEFINE_boolean("rotation_only", False, "Rotation only case") +DEFINE_boolean("stacking_reward", False, "Train a block stacking critic which estimates the reward of a proposed action based on the current state and command.") +DEFINE_integer("max_loss", 0, "To set positive reward; for stacking dataset only") +DEFINE_boolean("use_msle", False, "Use Mean Square Logarithmic Error as Loss") DEFINE_integer("num_epochs", 300, "") DEFINE_integer("child_lr_dec_every", 100, "") -DEFINE_integer("child_num_layers", 5, "") -DEFINE_integer("child_num_cells", 5, "") +DEFINE_integer("child_num_layers", 5, "number of layer blocks") +DEFINE_integer("child_num_cells", 5, "number of cells in a single layer") +DEFINE_integer("child_pool_distance", 2, "number of layers between each pooling step, which reduces the resolution") DEFINE_integer("child_filter_size", 5, "") DEFINE_integer("child_out_filters", 48, "") DEFINE_integer("child_out_filters_scale", 1, "") @@ -72,6 +86,7 @@ DEFINE_boolean("child_use_aux_heads", False, "Should we use an aux head") DEFINE_boolean("child_sync_replicas", False, "To sync or not to sync.") DEFINE_boolean("child_lr_cosine", False, "Use cosine lr schedule") +DEFINE_string("child_optimizer", "momentum", "Optimization algorithm, one of sgd, momentum or adam") DEFINE_float("controller_lr", 1e-3, "") DEFINE_float("controller_lr_dec_rate", 1.0, "") @@ -94,274 +109,414 @@ DEFINE_boolean("controller_sync_replicas", False, "To sync or not to sync.") DEFINE_boolean("controller_training", True, "") DEFINE_boolean("controller_use_critic", False, "") +DEFINE_string("controller_optimizer", "adam", "Optimization algorithm, one of sgd, momentum or adam") DEFINE_integer("log_every", 50, "How many steps to log") DEFINE_integer("eval_every_epochs", 1, "How many epochs to eval") +flags.DEFINE_float( + 'random_augmentation', + None, + 'Frequency from 0.0 to 1.0 with which random augmentation is performed. ' + 'Disabled by default and currently for block stacking dataset only.' +) + + def get_ops(images, labels): - """ - Args: - images: dict with keys {"train", "valid", "test"}. - labels: dict with keys {"train", "valid", "test"}. 
- """ - - assert FLAGS.search_for is not None, "Please specify --search_for" - - if FLAGS.search_for == "micro": - ControllerClass = MicroController - ChildClass = MicroChild - else: - ControllerClass = GeneralController - ChildClass = GeneralChild - - child_model = ChildClass( - images, - labels, - use_aux_heads=FLAGS.child_use_aux_heads, - cutout_size=FLAGS.child_cutout_size, - whole_channels=FLAGS.controller_search_whole_channels, - num_layers=FLAGS.child_num_layers, - num_cells=FLAGS.child_num_cells, - num_branches=FLAGS.child_num_branches, - fixed_arc=FLAGS.child_fixed_arc, - out_filters_scale=FLAGS.child_out_filters_scale, - out_filters=FLAGS.child_out_filters, - keep_prob=FLAGS.child_keep_prob, - drop_path_keep_prob=FLAGS.child_drop_path_keep_prob, - num_epochs=FLAGS.num_epochs, - l2_reg=FLAGS.child_l2_reg, - data_format=FLAGS.data_format, - batch_size=FLAGS.batch_size, - clip_mode="norm", - grad_bound=FLAGS.child_grad_bound, - lr_init=FLAGS.child_lr, - lr_dec_every=FLAGS.child_lr_dec_every, - lr_dec_rate=FLAGS.child_lr_dec_rate, - lr_cosine=FLAGS.child_lr_cosine, - lr_max=FLAGS.child_lr_max, - lr_min=FLAGS.child_lr_min, - lr_T_0=FLAGS.child_lr_T_0, - lr_T_mul=FLAGS.child_lr_T_mul, - optim_algo="momentum", - sync_replicas=FLAGS.child_sync_replicas, - num_aggregate=FLAGS.child_num_aggregate, - num_replicas=FLAGS.child_num_replicas, - ) - - if FLAGS.child_fixed_arc is None: - controller_model = ControllerClass( - search_for=FLAGS.search_for, - search_whole_channels=FLAGS.controller_search_whole_channels, - skip_target=FLAGS.controller_skip_target, - skip_weight=FLAGS.controller_skip_weight, - num_cells=FLAGS.child_num_cells, - num_layers=FLAGS.child_num_layers, - num_branches=FLAGS.child_num_branches, - out_filters=FLAGS.child_out_filters, - lstm_size=64, - lstm_num_layers=1, - lstm_keep_prob=1.0, - tanh_constant=FLAGS.controller_tanh_constant, - op_tanh_reduce=FLAGS.controller_op_tanh_reduce, - temperature=FLAGS.controller_temperature, - lr_init=FLAGS.controller_lr, - lr_dec_start=0, - lr_dec_every=1000000, # never decrease learning rate - l2_reg=FLAGS.controller_l2_reg, - entropy_weight=FLAGS.controller_entropy_weight, - bl_dec=FLAGS.controller_bl_dec, - use_critic=FLAGS.controller_use_critic, - optim_algo="adam", - sync_replicas=FLAGS.controller_sync_replicas, - num_aggregate=FLAGS.controller_num_aggregate, - num_replicas=FLAGS.controller_num_replicas) - - child_model.connect_controller(controller_model) - controller_model.build_trainer(child_model) - - controller_ops = { - "train_step": controller_model.train_step, - "loss": controller_model.loss, - "train_op": controller_model.train_op, - "lr": controller_model.lr, - "grad_norm": controller_model.grad_norm, - "valid_acc": controller_model.valid_acc, - "optimizer": controller_model.optimizer, - "baseline": controller_model.baseline, - "entropy": controller_model.sample_entropy, - "sample_arc": controller_model.sample_arc, - "skip_rate": controller_model.skip_rate, + """ + Args: + images: dict with keys {"train", "valid", "test"}. + labels: dict with keys {"train", "valid", "test"}. 
+ """ + + assert FLAGS.search_for is not None, "Please specify --search_for" + + if FLAGS.search_for == "micro": + ControllerClass = MicroController + ChildClass = MicroChild + else: + ControllerClass = GeneralController + ChildClass = GeneralChild + + child_model = ChildClass( + images, + labels, + use_aux_heads=FLAGS.child_use_aux_heads, + cutout_size=FLAGS.child_cutout_size, + whole_channels=FLAGS.controller_search_whole_channels, + num_layers=FLAGS.child_num_layers, + num_cells=FLAGS.child_num_cells, + num_branches=FLAGS.child_num_branches, + fixed_arc=FLAGS.child_fixed_arc, + out_filters_scale=FLAGS.child_out_filters_scale, + out_filters=FLAGS.child_out_filters, + keep_prob=FLAGS.child_keep_prob, + drop_path_keep_prob=FLAGS.child_drop_path_keep_prob, + num_epochs=FLAGS.num_epochs, + l2_reg=FLAGS.child_l2_reg, + data_format=FLAGS.data_format, + batch_size=FLAGS.batch_size, + clip_mode="norm", + grad_bound=FLAGS.child_grad_bound, + lr_init=FLAGS.child_lr, + lr_dec_every=FLAGS.child_lr_dec_every, + lr_dec_rate=FLAGS.child_lr_dec_rate, + lr_cosine=FLAGS.child_lr_cosine, + lr_max=FLAGS.child_lr_max, + lr_min=FLAGS.child_lr_min, + lr_T_0=FLAGS.child_lr_T_0, + lr_T_mul=FLAGS.child_lr_T_mul, + optim_algo=FLAGS.child_optimizer, + sync_replicas=FLAGS.child_sync_replicas, + num_aggregate=FLAGS.child_num_aggregate, + num_replicas=FLAGS.child_num_replicas, + valid_set_size=FLAGS.valid_set_size, + image_shape=(FLAGS.height_img, FLAGS.width_img, 3), + translation_only=FLAGS.translation_only, + rotation_only=FLAGS.rotation_only, + stacking_reward=FLAGS.stacking_reward, + use_root=FLAGS.use_root, + dataset=FLAGS.dataset, + data_base_path=FLAGS.data_base_path, + output_dir=FLAGS.output_dir, + pool_distance=FLAGS.child_pool_distance, + one_hot_encoding=FLAGS.one_hot_encoding, + use_msle=FLAGS.use_msle, + random_augmentation=FLAGS.random_augmentation + ) + if FLAGS.child_fixed_arc is None: + controller_model = ControllerClass( + search_for=FLAGS.search_for, + search_whole_channels=FLAGS.controller_search_whole_channels, + skip_target=FLAGS.controller_skip_target, + skip_weight=FLAGS.controller_skip_weight, + num_cells=FLAGS.child_num_cells, + num_layers=FLAGS.child_num_layers, + num_branches=FLAGS.child_num_branches, + out_filters=FLAGS.child_out_filters, + lstm_size=64, + lstm_num_layers=1, + lstm_keep_prob=1.0, + tanh_constant=FLAGS.controller_tanh_constant, + op_tanh_reduce=FLAGS.controller_op_tanh_reduce, + temperature=FLAGS.controller_temperature, + lr_init=FLAGS.controller_lr, + lr_dec_start=0, + lr_dec_every=1000000, # never decrease learning rate + l2_reg=FLAGS.controller_l2_reg, + entropy_weight=FLAGS.controller_entropy_weight, + bl_dec=FLAGS.controller_bl_dec, + use_critic=FLAGS.controller_use_critic, + optim_algo=FLAGS.controller_optimizer, + sync_replicas=FLAGS.controller_sync_replicas, + num_aggregate=FLAGS.controller_num_aggregate, + num_replicas=FLAGS.controller_num_replicas, + max_loss=FLAGS.max_loss, + dataset=FLAGS.dataset) + + child_model.connect_controller(controller_model) + controller_model.build_trainer(child_model) + + controller_ops = { + "train_step": controller_model.train_step, + "loss": controller_model.loss, + "train_op": controller_model.train_op, + "lr": controller_model.lr, + "grad_norm": controller_model.grad_norm, + "valid_acc": controller_model.valid_acc, + "optimizer": controller_model.optimizer, + "baseline": controller_model.baseline, + "entropy": controller_model.sample_entropy, + "sample_arc": controller_model.sample_arc, + "skip_rate": controller_model.skip_rate, 
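+        # The entries below expose extra regression and reward statistics
+        # defined on the controller model as part of the stacking dataset support.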
+ "reward": controller_model.reward, + "mse": controller_model.mse, + "cart_error": controller_model.cart_error, + "angle_error": controller_model.angle_error, + "mae": controller_model.mae, + # "g_emb": controller_model.g_emb, + } + else: + assert not FLAGS.controller_training, ( + "--child_fixed_arc is given, cannot train controller") + child_model.connect_controller(None) + controller_ops = None + + child_ops = { + "global_step": child_model.global_step, + "loss": child_model.loss, + "loss_sec": child_model.loss_secondary, + "train_op": child_model.train_op, + "lr": child_model.lr, + "grad_norm": child_model.grad_norm, + "train_acc": child_model.train_acc, + "train_acc_5mm_7_5deg": child_model.train_acc_5mm_7_5deg, + "train_acc_1cm_15deg": child_model.train_acc_1cm_15deg, + "train_acc_2cm_30deg": child_model.train_acc_2cm_30deg, + "train_acc_4cm_60deg": child_model.train_acc_4cm_60deg, + "train_acc_8cm_120deg": child_model.train_acc_8cm_120deg, + "train_acc_16cm_240deg": child_model.train_acc_16cm_240deg, + "train_acc_32cm_360deg": child_model.train_acc_32cm_360deg, + "optimizer": child_model.optimizer, + "num_train_batches": child_model.num_train_batches, + "train_angle_error": child_model.train_angle_error, + "train_cart_error": child_model.train_cart_error, + "train_mae": child_model.train_mae, + "train_preds": child_model.train_preds[0], + "train_label": child_model.y_train[0], } - else: - assert not FLAGS.controller_training, ( - "--child_fixed_arc is given, cannot train controller") - child_model.connect_controller(None) - controller_ops = None - - child_ops = { - "global_step": child_model.global_step, - "loss": child_model.loss, - "train_op": child_model.train_op, - "lr": child_model.lr, - "grad_norm": child_model.grad_norm, - "train_acc": child_model.train_acc, - "optimizer": child_model.optimizer, - "num_train_batches": child_model.num_train_batches, - } - - ops = { - "child": child_ops, - "controller": controller_ops, - "eval_every": child_model.num_train_batches * FLAGS.eval_every_epochs, - "eval_func": child_model.eval_once, - "num_train_batches": child_model.num_train_batches, - } - - return ops + ops = { + "child": child_ops, + "controller": controller_ops, + "eval_every": child_model.num_train_batches * FLAGS.eval_every_epochs, + "eval_func": child_model.eval_once, + "num_train_batches": child_model.num_train_batches, + } -def train(): - if FLAGS.child_fixed_arc is None: - images, labels = read_data(FLAGS.data_path, dataset = FLAGS.dataset) - else: - images, labels = read_data(FLAGS.data_path, num_valids = 0, dataset = FLAGS.dataset) - - g = tf.Graph() - with g.as_default(): - ops = get_ops(images, labels) - child_ops = ops["child"] - controller_ops = ops["controller"] - - saver = tf.train.Saver(max_to_keep=2) - checkpoint_saver_hook = tf.train.CheckpointSaverHook( - FLAGS.output_dir, save_steps=child_ops["num_train_batches"], saver=saver) - - hooks = [checkpoint_saver_hook] - if FLAGS.child_sync_replicas: - sync_replicas_hook = child_ops["optimizer"].make_session_run_hook(True) - hooks.append(sync_replicas_hook) - if FLAGS.controller_training and FLAGS.controller_sync_replicas: - sync_replicas_hook = controller_ops["optimizer"].make_session_run_hook(True) - hooks.append(sync_replicas_hook) + return ops - print("-" * 80) - print("Starting session") - config = tf.ConfigProto(allow_soft_placement=True) - with tf.train.SingularMonitoredSession( - config=config, hooks=hooks, checkpoint_dir=FLAGS.output_dir) as sess: - start_time = time.time() - while True: - run_ops = [ - 
child_ops["loss"], - child_ops["lr"], - child_ops["grad_norm"], - child_ops["train_acc"], - child_ops["train_op"], - ] - loss, lr, gn, tr_acc, _ = sess.run(run_ops) - global_step = sess.run(child_ops["global_step"]) - - if FLAGS.child_sync_replicas: - actual_step = global_step * FLAGS.num_aggregate - else: - actual_step = global_step - epoch = actual_step // ops["num_train_batches"] - curr_time = time.time() - if global_step % FLAGS.log_every == 0: - log_string = "" - log_string += "epoch={:<6d}".format(epoch) - log_string += "ch_step={:<6d}".format(global_step) - log_string += " loss={:<8.6f}".format(loss) - log_string += " lr={:<8.4f}".format(lr) - log_string += " |g|={:<8.4f}".format(gn) - log_string += " tr_acc={:<3d}/{:>3d}".format( - tr_acc, FLAGS.batch_size) - log_string += " mins={:<10.2f}".format( - float(curr_time - start_time) / 60) - print(log_string) - - if actual_step % ops["eval_every"] == 0: - if (FLAGS.controller_training and - epoch % FLAGS.controller_train_every == 0): - print("Epoch {}: Training controller".format(epoch)) - for ct_step in range(FLAGS.controller_train_steps * - FLAGS.controller_num_aggregate): + +def train(): + if FLAGS.child_fixed_arc is None: + images, labels = read_data(FLAGS.data_path, dataset=FLAGS.dataset) + else: + images, labels = read_data( + FLAGS.data_path, num_valids=0, dataset=FLAGS.dataset) + + g = tf.Graph() + with g.as_default(): + ops = get_ops(images, labels) + child_ops = ops["child"] + controller_ops = ops["controller"] + + saver = tf.train.Saver(max_to_keep=2) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + FLAGS.output_dir, save_steps=child_ops["num_train_batches"], saver=saver) + + hooks = [checkpoint_saver_hook] + if FLAGS.child_sync_replicas: + sync_replicas_hook = child_ops["optimizer"].make_session_run_hook( + True) + hooks.append(sync_replicas_hook) + if FLAGS.controller_training and FLAGS.controller_sync_replicas: + sync_replicas_hook = controller_ops["optimizer"].make_session_run_hook( + True) + hooks.append(sync_replicas_hook) + + print("-" * 80) + print("Starting session") + config = tf.ConfigProto(allow_soft_placement=True) + with tf.train.SingularMonitoredSession( + config=config, hooks=hooks, checkpoint_dir=FLAGS.output_dir) as sess: + start_time = time.time() + print("SingularMonitoredSession started..") + while True: run_ops = [ - controller_ops["loss"], - controller_ops["entropy"], - controller_ops["lr"], - controller_ops["grad_norm"], - controller_ops["valid_acc"], - controller_ops["baseline"], - controller_ops["skip_rate"], - controller_ops["train_op"], + child_ops["loss"], + child_ops["loss_sec"], + child_ops["lr"], + child_ops["grad_norm"], + child_ops["train_acc"], + child_ops["train_acc_5mm_7_5deg"], + child_ops["train_acc_1cm_15deg"], + child_ops["train_acc_2cm_30deg"], + child_ops["train_acc_4cm_60deg"], + child_ops["train_acc_8cm_120deg"], + child_ops["train_acc_16cm_240deg"], + child_ops["train_acc_32cm_360deg"], + child_ops["train_op"], + child_ops["train_angle_error"], + child_ops["train_cart_error"], + child_ops["train_mae"], + child_ops["train_preds"], + child_ops["train_label"], ] - loss, entropy, lr, gn, val_acc, bl, skip, _ = sess.run(run_ops) - controller_step = sess.run(controller_ops["train_step"]) - - if ct_step % FLAGS.log_every == 0: - curr_time = time.time() - log_string = "" - log_string += "ctrl_step={:<6d}".format(controller_step) - log_string += " loss={:<7.3f}".format(loss) - log_string += " ent={:<5.2f}".format(entropy) - log_string += " lr={:<6.4f}".format(lr) - log_string += 
" |g|={:<8.4f}".format(gn) - log_string += " acc={:<6.4f}".format(val_acc) - log_string += " bl={:<5.2f}".format(bl) - log_string += " mins={:<.2f}".format( - float(curr_time - start_time) / 60) - print(log_string) - - print("Here are 10 architectures") - for _ in range(10): - arc, acc = sess.run([ - controller_ops["sample_arc"], - controller_ops["valid_acc"], - ]) - if FLAGS.search_for == "micro": - normal_arc, reduce_arc = arc - print(np.reshape(normal_arc, [-1])) - print(np.reshape(reduce_arc, [-1])) + loss, loss_sec, lr, gn, tr_acc, tr_acc_5_7_5, tr_acc_1_15, tr_acc_2_30, tr_acc_4_60, tr_acc_8_120, tr_acc_16_240, tr_acc_32_360, tr_op, tr_angle_error, tr_cart_error, tr_mae, tr_preds, tr_label = sess.run( + run_ops) + global_step = sess.run(child_ops["global_step"]) + print("---------------global step", global_step, end="\r") + + if FLAGS.child_sync_replicas: + actual_step = global_step * FLAGS.num_aggregate else: - start = 0 - for layer_id in range(FLAGS.child_num_layers): - if FLAGS.controller_search_whole_channels: - end = start + 1 + layer_id + actual_step = global_step + epoch = actual_step // ops["num_train_batches"] + curr_time = time.time() + if global_step % FLAGS.log_every == 0: + log_string = "\n" + log_string += "epoch={:<6d}".format(epoch) + log_string += "ch_step={:<6d}".format(global_step) + log_string += " child_loss={}".format(loss) + log_string += " child_loss_sec={}".format(loss_sec) + # log_string += " child_loss={:<8.6f}".format(loss) + # log_string += " child_loss_sec={:<8.6f}".format(loss_sec) + log_string += " lr={:<8.4f}".format(lr) + log_string += " |g|={:<8.4f}".format(gn) + log_string += " child_tr_acc={:<3f}".format( + tr_acc) + log_string += "\nchild_tr_acc_5mm_7_5deg={:<3f}".format( + tr_acc_5_7_5) + log_string += "\nchild_tr_acc_1cm_15deg={:<3f}".format( + tr_acc_1_15) + log_string += "\nchild_tr_acc_2cm_30deg={:<3f}".format( + tr_acc_2_30) + log_string += "\nchild_tr_acc_4cm_60deg={:<3f}".format( + tr_acc_4_60) + log_string += "\nchild_tr_acc_8cm_120deg={:<3f}".format( + tr_acc_8_120) + log_string += "\nchild_tr_acc_16cm_240deg={:<3f}".format( + tr_acc_16_240) + log_string += "\nchild_tr_acc_32cm_360deg={:<3f}".format( + tr_acc_32_360) + log_string += " mins={:<10.2f}".format( + float(curr_time - start_time) / 60) + if FLAGS.dataset == "stacking": + if FLAGS.translation_only is False and FLAGS.stacking_reward is False: + log_string += "\ntr_ang_error={}".format(tr_angle_error) + if FLAGS.rotation_only is False and FLAGS.stacking_reward is False: + log_string += " tr_cart_error={}".format(tr_cart_error) + log_string += " tr_mae={}".format(tr_mae) + log_string += "\ntr_preds={}".format(tr_preds) + log_string += "\ntr_label={}".format(tr_label) + print(log_string) + if os.path.exists(os.path.join(FLAGS.output_dir,"train_metrics.csv")): + file_mode = 'a' else: - end = start + 2 * FLAGS.child_num_branches + layer_id - print(np.reshape(arc[start: end], [-1])) - start = end - print("val_acc={:<6.4f}".format(acc)) - print("-" * 80) - - print("Epoch {}: Eval".format(epoch)) - if FLAGS.child_fixed_arc is None: - ops["eval_func"](sess, "valid") - ops["eval_func"](sess, "test") - - if epoch >= FLAGS.num_epochs: - break + file_mode = 'w+' + with open(os.path.join(FLAGS.output_dir, "train_metrics.csv"), file_mode) as fp: + fp.write("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n".format( + epoch, global_step, loss, loss_sec, tr_acc, tr_acc_5_7_5, tr_acc_1_15, tr_acc_2_30, tr_acc_4_60, tr_acc_8_120, tr_acc_16_240, tr_acc_32_360, tr_op, tr_angle_error, 
tr_cart_error, tr_mae))
+
+                if actual_step % ops["eval_every"] == 0:
+                    if (FLAGS.controller_training and
+                            epoch % FLAGS.controller_train_every == 0):
+                        print("Epoch {}: Training controller".format(epoch))
+                        for ct_step in range(FLAGS.controller_train_steps *
+                                             FLAGS.controller_num_aggregate):
+                            run_ops = [
+                                controller_ops["loss"],
+                                controller_ops["entropy"],
+                                controller_ops["lr"],
+                                controller_ops["grad_norm"],
+                                controller_ops["valid_acc"],
+                                controller_ops["baseline"],
+                                controller_ops["reward"],
+                                controller_ops["mse"],
+                                controller_ops["cart_error"],
+                                controller_ops["angle_error"],
+                                controller_ops["mae"],
+                                controller_ops["skip_rate"],
+                                controller_ops["train_op"],
+                            ]
+                            loss, entropy, lr, gn, val_acc, bl, reward, c_mse, cart_error, angle_error, mae, skip, _ = sess.run(
+                                run_ops)
+                            controller_step = sess.run(
+                                controller_ops["train_step"])
+
+                            if ct_step % FLAGS.log_every == 0:
+                                curr_time = time.time()
+                                log_string = "\n"
+                                log_string += "ctrl_step={:<6d}".format(
+                                    controller_step)
+                                log_string += " controller_loss={:<7.3f}".format(loss)
+                                log_string += " ent={:<5.2f}".format(entropy)
+                                log_string += " lr={:<6.4f}".format(lr)
+                                log_string += " |g|={:<8.4f}".format(gn)
+                                log_string += " acc={:<6.4f}".format(val_acc)
+                                log_string += " bl={:<5.2f}".format(bl)
+                                log_string += " mins={:<.2f}".format(
+                                    float(curr_time - start_time) / 60)
+                                log_string += " rw={}".format(reward)
+                                log_string += " mse={}".format(c_mse)
+                                if FLAGS.dataset == "stacking":
+                                    if FLAGS.rotation_only is False and FLAGS.stacking_reward is False:
+                                        log_string += "\ncart_error={}".format(cart_error)
+                                    if FLAGS.translation_only is False and FLAGS.stacking_reward is False:
+                                        log_string += "\nangle_error={}".format(angle_error)
+                                    log_string += "\nmae={}".format(mae)
+                                # log_string += "\n g_emb = {}".format(g_emb)
+                                print(log_string)
+                                if os.path.exists(os.path.join(FLAGS.output_dir, "controller_metrics.csv")):
+                                    file_mode = 'a'
+                                else:
+                                    file_mode = 'w+'
+                                with open(os.path.join(FLAGS.output_dir, "controller_metrics.csv"), file_mode) as fp:
+                                    fp.write("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n".format(
+                                        epoch, controller_step, loss, entropy, lr, gn, val_acc, bl, reward, c_mse, cart_error, angle_error, mae))
+
+                    print("Here are 10 architectures")
+                    for _ in range(10):
+                        arc, acc, c_loss, mse, selected_cart_error, selected_angle_error, selected_mae = sess.run([
+                            controller_ops["sample_arc"],
+                            controller_ops["valid_acc"],
+                            controller_ops["loss"],
+                            controller_ops["mse"],
+                            controller_ops["cart_error"],
+                            controller_ops["angle_error"],
+                            controller_ops["mae"],
+                        ])
+                        if FLAGS.search_for == "micro":
+                            normal_arc, reduce_arc = arc
+                            print(np.reshape(normal_arc, [-1]))
+                            print(np.reshape(reduce_arc, [-1]))
+                        else:
+                            start = 0
+                            for layer_id in range(FLAGS.child_num_layers):
+                                if FLAGS.controller_search_whole_channels:
+                                    end = start + 1 + layer_id
+                                else:
+                                    end = start + 2 * FLAGS.child_num_branches + layer_id
+                                print(np.reshape(arc[start: end], [-1]))
+                                start = end
+                        print("val_acc={:<6.4f}".format(acc))
+                        print("controller_loss={}".format(c_loss))
+                        if FLAGS.dataset == "stacking":
+                            print("mse={}".format(mse))
+                            if FLAGS.rotation_only is False and FLAGS.stacking_reward is False:
+                                print("cart_error={}".format(selected_cart_error))
+                            if FLAGS.translation_only is False and FLAGS.stacking_reward is False:
+                                print("angle_error={}".format(selected_angle_error))
+                            print("mae={}".format(selected_mae))
+                        print("-" * 80)
+
+                    print("Epoch {}: Eval".format(epoch))
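+                    # ops["eval_func"] is child_model.eval_once (see get_ops);
+                    # it reports the validation and test metrics whose CSV
+                    # headers are written in main(). A minimal offline
+                    # inspection sketch (assumes pandas is installed):
+                    #     import pandas as pd
+                    #     df = pd.read_csv(
+                    #         os.path.join(FLAGS.output_dir, "train_metrics.csv"),
+                    #         skipinitialspace=True)
+                    #     print(df[["epoch", "loss", "tr_mae"]].tail())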
+                    # print(np.reshape(normal_arc, [-1]))
+                    # print(np.reshape(reduce_arc, [-1]))
+                    ops["eval_func"](sess, "valid")
+                    # print(np.reshape(normal_arc, [-1]))
+                    # print(np.reshape(reduce_arc, [-1]))
+                    ops["eval_func"](sess, "test")
+
+                if epoch >= FLAGS.num_epochs:
+                    break


 def main(_):
-  print("-" * 80)
-  if not os.path.isdir(FLAGS.output_dir):
-    print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
-    os.makedirs(FLAGS.output_dir)
-  elif FLAGS.reset_output_dir:
-    print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
-    shutil.rmtree(FLAGS.output_dir)
-    os.makedirs(FLAGS.output_dir)
-
-  print("-" * 80)
-  log_file = os.path.join(FLAGS.output_dir, "stdout")
-  print("Logging to {}".format(log_file))
-  sys.stdout = Logger(log_file)
+    print("-" * 80)
+    if not os.path.isdir(FLAGS.output_dir):
+        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
+        os.makedirs(FLAGS.output_dir)
+    elif FLAGS.reset_output_dir:
+        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
+        shutil.rmtree(FLAGS.output_dir)
+        os.makedirs(FLAGS.output_dir)

-  utils.print_user_flags()
-  train()
+    print("-" * 80)
+    log_file = os.path.join(FLAGS.output_dir, "stdout")
+    print("Logging to {}".format(log_file))
+    sys.stdout = Logger(log_file)
+
+    utils.print_user_flags()
+    with open(os.path.join(FLAGS.output_dir, "controller_metrics.csv"), 'w') as fp:
+        fp.write("epoch, controller_step, loss, entropy, lr, gn, val_acc, bl, reward, c_mse, cart_error, angle_error, mae\n")
+    with open(os.path.join(FLAGS.output_dir, "train_metrics.csv"), 'w') as fp:
+        fp.write("epoch, global_step, loss, loss_sec, tr_acc, tr_acc_5_7_5, tr_acc_1_15, tr_acc_2_30, tr_acc_4_60, tr_acc_8_120, tr_acc_16_240, tr_acc_32_360, tr_op, tr_angle_error, tr_cart_error, tr_mae\n")
+    with open(os.path.join(FLAGS.output_dir, "valid_metrics.csv"), 'w') as fp:
+        fp.write("total_acc, total_acc_5mm_7_5deg, total_acc_1cm_15deg, total_acc_2cm_30deg, total_acc_4cm_60deg, total_acc_8cm_120deg, total_acc_16cm_240deg, total_acc_32cm_360deg, total_loss, total_mae, total_angle_error, total_cart_error, total_loss_sec\n")
+    with open(os.path.join(FLAGS.output_dir, "test_metrics.csv"), 'w') as fp:
+        fp.write("total_acc, total_acc_5mm_7_5deg, total_acc_1cm_15deg, total_acc_2cm_30deg, total_acc_4cm_60deg, total_acc_8cm_120deg, total_acc_16cm_240deg, total_acc_32cm_360deg, total_loss, total_mae, total_angle_error, total_cart_error, total_loss_sec\n")
+    train()


 if __name__ == "__main__":
-  tf.app.run()
+    tf.app.run()
diff --git a/enas/cifar10/micro_child.py b/enas/cifar10/micro_child.py
index 5102b98..a79992a 100644
--- a/enas/cifar10/micro_child.py
+++ b/enas/cifar10/micro_child.py
@@ -4,6 +4,7 @@
 import os
 import sys
+import traceback

 import numpy as np
 import tensorflow as tf
@@ -11,812 +12,1446 @@
 from enas.cifar10.models import Model
 from enas.cifar10.image_ops import conv
 from enas.cifar10.image_ops import fully_connected
-from enas.cifar10.image_ops import batch_norm
+from enas.cifar10.image_ops import norm
 from enas.cifar10.image_ops import batch_norm_with_mask
 from enas.cifar10.image_ops import relu
 from enas.cifar10.image_ops import max_pool
 from enas.cifar10.image_ops import drop_path
-from enas.cifar10.image_ops import global_avg_pool
+from enas.cifar10.image_ops import global_max_pool

 from enas.utils import count_model_params
 from enas.utils import get_train_ops
 from enas.common_ops import create_weight

+import keras
+
+import grasp_metrics
+

 class MicroChild(Model):
-  def __init__(self,
-               images,
-               labels,
-               use_aux_heads=False,
-               cutout_size=None,
-               fixed_arc=None,
-               num_layers=2,
-               num_cells=5,
-
out_filters=24, - keep_prob=1.0, - drop_path_keep_prob=None, - batch_size=32, - clip_mode=None, - grad_bound=None, - l2_reg=1e-4, - lr_init=0.1, - lr_dec_start=0, - lr_dec_every=10000, - lr_dec_rate=0.1, - lr_cosine=False, - lr_max=None, - lr_min=None, - lr_T_0=None, - lr_T_mul=None, - num_epochs=None, - optim_algo=None, - sync_replicas=False, - num_aggregate=None, - num_replicas=None, - data_format="NHWC", - name="child", - **kwargs - ): - """ - """ - - super(self.__class__, self).__init__( - images, - labels, - cutout_size=cutout_size, - batch_size=batch_size, - clip_mode=clip_mode, - grad_bound=grad_bound, - l2_reg=l2_reg, - lr_init=lr_init, - lr_dec_start=lr_dec_start, - lr_dec_every=lr_dec_every, - lr_dec_rate=lr_dec_rate, - keep_prob=keep_prob, - optim_algo=optim_algo, - sync_replicas=sync_replicas, - num_aggregate=num_aggregate, - num_replicas=num_replicas, - data_format=data_format, - name=name) - - if self.data_format == "NHWC": - self.actual_data_format = "channels_last" - elif self.data_format == "NCHW": - self.actual_data_format = "channels_first" - else: - raise ValueError("Unknown data_format '{0}'".format(self.data_format)) - - self.use_aux_heads = use_aux_heads - self.num_epochs = num_epochs - self.num_train_steps = self.num_epochs * self.num_train_batches - self.drop_path_keep_prob = drop_path_keep_prob - self.lr_cosine = lr_cosine - self.lr_max = lr_max - self.lr_min = lr_min - self.lr_T_0 = lr_T_0 - self.lr_T_mul = lr_T_mul - self.out_filters = out_filters - self.num_layers = num_layers - self.num_cells = num_cells - self.fixed_arc = fixed_arc - - self.global_step = tf.Variable( - 0, dtype=tf.int32, trainable=False, name="global_step") - - if self.drop_path_keep_prob is not None: - assert num_epochs is not None, "Need num_epochs to drop_path" - - pool_distance = self.num_layers // 3 - self.pool_layers = [pool_distance, 2 * pool_distance + 1] - - if self.use_aux_heads: - self.aux_head_indices = [self.pool_layers[-1] + 1] - - def _factorized_reduction(self, x, out_filters, stride, is_training): - """Reduces the shape of x without information loss due to striding.""" - assert out_filters % 2 == 0, ( - "Need even number of filters when using this factorized reduction.") - if stride == 1: - with tf.variable_scope("path_conv"): - inp_c = self._get_C(x) - w = create_weight("w", [1, 1, inp_c, out_filters]) - x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) + def __init__(self, + images, + labels, + use_aux_heads=False, + cutout_size=None, + fixed_arc=None, + num_layers=2, + num_cells=5, + out_filters=24, + keep_prob=1.0, + drop_path_keep_prob=None, + batch_size=32, + clip_mode=None, + grad_bound=None, + l2_reg=1e-4, + lr_init=0.1, + lr_dec_start=0, + lr_dec_every=10000, + lr_dec_rate=0.1, + lr_cosine=False, + lr_max=None, + lr_min=None, + lr_T_0=None, + lr_T_mul=None, + num_epochs=None, + optim_algo=None, + sync_replicas=False, + num_aggregate=None, + num_replicas=None, + data_format="NHWC", + name="child", + valid_set_size=32, + image_shape=(32, 32, 3), + translation_only=False, + rotation_only=False, + stacking_reward=False, + use_root=False, + one_hot_encoding=False, + dataset="cifar", + data_base_path="", + output_dir="", + pool_distance=2, + use_msle=False, + **kwargs + ): + + super(self.__class__, self).__init__( + images, + labels, + cutout_size=cutout_size, + batch_size=batch_size, + clip_mode=clip_mode, + grad_bound=grad_bound, + l2_reg=l2_reg, + lr_init=lr_init, + 
lr_dec_start=lr_dec_start, + lr_dec_every=lr_dec_every, + lr_dec_rate=lr_dec_rate, + keep_prob=keep_prob, + optim_algo=optim_algo, + sync_replicas=sync_replicas, + num_aggregate=num_aggregate, + num_replicas=num_replicas, + data_format=data_format, + name=name, + valid_set_size=valid_set_size, + image_shape=image_shape, + translation_only=translation_only, + rotation_only=rotation_only, + stacking_reward=stacking_reward, + data_base_path=data_base_path, + use_root=use_root, + one_hot_encoding=one_hot_encoding, + dataset=dataset) + + if self.data_format == "NHWC": + self.actual_data_format = "channels_last" + elif self.data_format == "NCHW": + self.actual_data_format = "channels_first" + else: + raise ValueError( + "Unknown data_format '{0}'".format(self.data_format)) + + self.use_aux_heads = use_aux_heads + self.use_root = use_root + self.num_epochs = num_epochs + self.num_train_steps = self.num_epochs * self.num_train_batches + self.drop_path_keep_prob = drop_path_keep_prob + self.lr_cosine = lr_cosine + self.lr_max = lr_max + self.lr_min = lr_min + self.lr_T_0 = lr_T_0 + self.lr_T_mul = lr_T_mul + self.out_filters = out_filters + self.num_layers = num_layers + self.num_cells = num_cells + self.fixed_arc = fixed_arc + self.translation_only = translation_only + self.rotation_only = rotation_only + self.stacking_reward = stacking_reward + self.data_base_path = data_base_path + self.verbose = 0 + self.output_dir = output_dir + self.one_hot_encoding = one_hot_encoding + self.use_msle = use_msle + + self.global_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name="global_step") + + if self.drop_path_keep_prob is not None: + assert num_epochs is not None, "Need num_epochs to drop_path" + + self.pool_distance = pool_distance + # pool_distance was originally based on the number of layers + # pool_distance = self.num_layers // 3 + # self.pool_layers = [pool_distance, 2 * pool_distance + 1] + + self.pool_layers = [] + for layer_num in range(self.num_layers): + if layer_num != 0 and layer_num % pool_distance == 0: + self.pool_layers += [layer_num] + + if self.use_aux_heads: + if len(self.pool_layers) > 2: + pool_index = int(len(self.pool_layers) / 2) + self.aux_head_indices = [self.pool_layers[pool_index] + 1] + else: + self.aux_head_indices = [self.pool_layers[-1] + 1] + + def _factorized_reduction(self, x, out_filters, stride, is_training): + """Reduces the shape of x without information loss due to striding.""" + assert out_filters % 2 == 0, ( + "Need even number of filters when using this factorized\ + reduction.") + if stride == 1: + with tf.variable_scope("path_conv"): + inp_c = self._get_C(x) + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + x = norm(x, is_training=is_training, data_format=self.data_format, norm_type="batch") + return x + + stride_spec = self._get_strides(stride) + # Skip path 1 + path1 = tf.nn.max_pool( + x, [1, 1, 1, 1], stride_spec, "VALID", + data_format=self.data_format) + with tf.variable_scope("path1_conv"): + inp_c = self._get_C(path1) + w = create_weight("w", [1, 1, inp_c, out_filters // 2]) + path1 = tf.nn.conv2d(path1, w, [1, 1, 1, 1], "VALID", + data_format=self.data_format) + + # Skip path 2 + # First pad with 0"s on the right and bottom, then shift the filter to + # include those 0"s that were added. 
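+        # Worked example (illustrative): with stride 2 on an [N, 8, 8, C]
+        # NHWC input, path1 above keeps pixels at even coordinates
+        # (0, 2, 4, 6), while the pad-and-shift below makes path2 keep
+        # pixels at odd coordinates (1, 3, 5, 7). Each path then applies a
+        # 1x1 conv with out_filters // 2 channels and the two results are
+        # concatenated, so the spatial size is halved without discarding
+        # the pixel grid a plain strided pooling would skip.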
+ if self.data_format == "NHWC": + pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]] + path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :] + concat_axis = 3 + else: + pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]] + path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:] + concat_axis = 1 + + path2 = tf.nn.max_pool( + path2, [1, 1, 1, 1], stride_spec, "VALID", + data_format=self.data_format) + with tf.variable_scope("path2_conv"): + inp_c = self._get_C(path2) + w = create_weight("w", [1, 1, inp_c, out_filters // 2]) + path2 = tf.nn.conv2d(path2, w, [1, 1, 1, 1], "VALID", + data_format=self.data_format) + + # Concat and apply BN + final_path = tf.concat(values=[path1, path2], axis=concat_axis) + final_path = norm(final_path, is_training=is_training, + data_format=self.data_format, norm_type="batch") + + return final_path + + def _get_C(self, x): + """ + Args: + x: tensor of shape [N, H, W, C] or [N, C, H, W] + """ + if self.data_format == "NHWC": + assert x.get_shape().as_list()[3] is not None + return x.get_shape()[3].value + elif self.data_format == "NCHW": + assert x.get_shape().as_list()[1] is not None + return x.get_shape()[1].value + else: + raise ValueError( + "Unknown data_format '{0}'".format(self.data_format)) + + def _get_HW(self, x): + """ + Args: + x: tensor of shape [N, H, W, C] or [N, C, H, W] + """ + assert x.get_shape().as_list()[2] is not None + return x.get_shape()[2].value + + def _get_strides(self, stride): + """ + Args: + x: tensor of shape [N, H, W, C] or [N, C, H, W] + """ + if self.data_format == "NHWC": + return [1, stride, stride, 1] + elif self.data_format == "NCHW": + return [1, 1, stride, stride] + else: + raise ValueError( + "Unknown data_format '{0}'".format(self.data_format)) + + def _apply_drop_path(self, x, layer_id): + drop_path_keep_prob = self.drop_path_keep_prob + + layer_ratio = float(layer_id + 1) / (self.num_layers + 2) + drop_path_keep_prob = 1.0 - layer_ratio * (1.0 - drop_path_keep_prob) + + step_ratio = tf.to_float(self.global_step + 1) / \ + tf.to_float(self.num_train_steps) + step_ratio = tf.minimum(1.0, step_ratio) + drop_path_keep_prob = 1.0 - step_ratio * (1.0 - drop_path_keep_prob) + + x = drop_path(x, drop_path_keep_prob) return x - stride_spec = self._get_strides(stride) - # Skip path 1 - path1 = tf.nn.avg_pool( - x, [1, 1, 1, 1], stride_spec, "VALID", data_format=self.data_format) - with tf.variable_scope("path1_conv"): - inp_c = self._get_C(path1) - w = create_weight("w", [1, 1, inp_c, out_filters // 2]) - path1 = tf.nn.conv2d(path1, w, [1, 1, 1, 1], "VALID", - data_format=self.data_format) - - # Skip path 2 - # First pad with 0"s on the right and bottom, then shift the filter to - # include those 0"s that were added. 
- if self.data_format == "NHWC": - pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]] - path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :] - concat_axis = 3 - else: - pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]] - path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:] - concat_axis = 1 - - path2 = tf.nn.avg_pool( - path2, [1, 1, 1, 1], stride_spec, "VALID", data_format=self.data_format) - with tf.variable_scope("path2_conv"): - inp_c = self._get_C(path2) - w = create_weight("w", [1, 1, inp_c, out_filters // 2]) - path2 = tf.nn.conv2d(path2, w, [1, 1, 1, 1], "VALID", - data_format=self.data_format) - - # Concat and apply BN - final_path = tf.concat(values=[path1, path2], axis=concat_axis) - final_path = batch_norm(final_path, is_training, - data_format=self.data_format) - - return final_path - - def _get_C(self, x): - """ - Args: - x: tensor of shape [N, H, W, C] or [N, C, H, W] - """ - if self.data_format == "NHWC": - return x.get_shape()[3].value - elif self.data_format == "NCHW": - return x.get_shape()[1].value - else: - raise ValueError("Unknown data_format '{0}'".format(self.data_format)) - - def _get_HW(self, x): - """ - Args: - x: tensor of shape [N, H, W, C] or [N, C, H, W] - """ - return x.get_shape()[2].value - - def _get_strides(self, stride): - """ - Args: - x: tensor of shape [N, H, W, C] or [N, C, H, W] - """ - if self.data_format == "NHWC": - return [1, stride, stride, 1] - elif self.data_format == "NCHW": - return [1, 1, stride, stride] - else: - raise ValueError("Unknown data_format '{0}'".format(self.data_format)) - - def _apply_drop_path(self, x, layer_id): - drop_path_keep_prob = self.drop_path_keep_prob - - layer_ratio = float(layer_id + 1) / (self.num_layers + 2) - drop_path_keep_prob = 1.0 - layer_ratio * (1.0 - drop_path_keep_prob) - - step_ratio = tf.to_float(self.global_step + 1) / tf.to_float(self.num_train_steps) - step_ratio = tf.minimum(1.0, step_ratio) - drop_path_keep_prob = 1.0 - step_ratio * (1.0 - drop_path_keep_prob) - - x = drop_path(x, drop_path_keep_prob) - return x - - def _maybe_calibrate_size(self, layers, out_filters, is_training): - """Makes sure layers[0] and layers[1] have the same shapes.""" - - hw = [self._get_HW(layer) for layer in layers] - c = [self._get_C(layer) for layer in layers] - - with tf.variable_scope("calibrate"): - x = layers[0] - if hw[0] != hw[1]: - assert hw[0] == 2 * hw[1] - with tf.variable_scope("pool_x"): - x = tf.nn.relu(x) - x = self._factorized_reduction(x, out_filters, 2, is_training) - elif c[0] != out_filters: - with tf.variable_scope("pool_x"): - w = create_weight("w", [1, 1, c[0], out_filters]) - x = tf.nn.relu(x) - x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) - - y = layers[1] - if c[1] != out_filters: - with tf.variable_scope("pool_y"): - w = create_weight("w", [1, 1, c[1], out_filters]) - y = tf.nn.relu(y) - y = tf.nn.conv2d(y, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - y = batch_norm(y, is_training, data_format=self.data_format) - return [x, y] - - def _model(self, images, is_training, reuse=False): - """Compute the logits given the images.""" - - if self.fixed_arc is None: - is_training = True - - with tf.variable_scope(self.name, reuse=reuse): - # the first two inputs - input_channels = self._get_C(images) - with tf.variable_scope("stem_conv"): - w = create_weight("w", [input_channels, input_channels, input_channels, self.out_filters * 3]) - x = tf.nn.conv2d( - images, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - x = 
batch_norm(x, is_training, data_format=self.data_format)
-      if self.data_format == "NHWC":
-        split_axis = 3
-      elif self.data_format == "NCHW":
-        split_axis = 1
-      else:
-        raise ValueError("Unknown data_format '{0}'".format(self.data_format))
-      layers = [x, x]
-
-      # building layers in the micro space
-      out_filters = self.out_filters
-      for layer_id in range(self.num_layers + 2):
-        with tf.variable_scope("layer_{0}".format(layer_id)):
-          if layer_id not in self.pool_layers:
-            if self.fixed_arc is None:
-              x = self._enas_layer(
-                layer_id, layers, self.normal_arc, out_filters)
+    def _maybe_calibrate_size(self, layers, out_filters, is_training):
+        """Makes sure layers[0] and layers[1] have the same shapes."""
+
+        hw = [self._get_HW(layer) for layer in layers]
+        c = [self._get_C(layer) for layer in layers]
+
+        with tf.variable_scope("calibrate"):
+            x = layers[0]
+            if hw[0] != hw[1]:
+                assert hw[0] == 2 * hw[1]
+                with tf.variable_scope("pool_x"):
+                    x = tf.nn.elu(x)
+                    x = self._factorized_reduction(
+                        x, out_filters, 2, is_training)
+            elif c[0] != out_filters:
+                with tf.variable_scope("pool_x"):
+                    w = create_weight("w", [1, 1, c[0], out_filters])
+                    x = tf.nn.elu(x)
+                    x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME",
+                                     data_format=self.data_format)
+                    x = norm(
+                        x, is_training=is_training, data_format=self.data_format, norm_type="batch")
+
+            y = layers[1]
+            if c[1] != out_filters:
+                with tf.variable_scope("pool_y"):
+                    w = create_weight("w", [1, 1, c[1], out_filters])
+                    y = tf.nn.elu(y)
+                    y = tf.nn.conv2d(y, w, [1, 1, 1, 1], "SAME",
+                                     data_format=self.data_format)
+                    y = norm(
+                        y, is_training=is_training, data_format=self.data_format, norm_type="batch")
+        return [x, y]
+
+    def concat_images_with_tiled_vector(self, images, vector):
+        """Combine a set of images with a vector, tiling the vector at each pixel in the images and concatenating on the channel axis.
+
+        # Params
+
+            images: list of images with the same dimensions
+            vector: vector to tile on each image. If you have
+                more than one vector, simply concatenate them
+                all before calling this function.
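+
+        # Example
+
+            For images of shape [batch, 224, 224, 3] and a vector of
+            shape [batch, 5], the result has shape [batch, 224, 224, 8]:
+            the 5 vector values are repeated at every pixel and appended
+            as extra channels (224x224 is only an illustrative size).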
+
+        # Returns
+
+            A tensor with the vector tiled as extra channels at each pixel
+            of the input images.
+        """
+        with tf.variable_scope('concat_images_with_tiled_vector'):
+            if not isinstance(images, list):
+                images = [images]
+            # keras.backend supplies int_shape/concatenate for symbolic tensors
+            image_shape = keras.backend.int_shape(images[0])
+            # NOTE: tile_vector_as_image_channels is not defined in this
+            # module; a TF/Keras equivalent of tile_vector_as_image_channels_np
+            # (block_stacking_reader.py) must be in scope for this to run.
+            tiled_vector = tile_vector_as_image_channels(vector, image_shape)
+            images.append(tiled_vector)
+            combined = keras.backend.concatenate(images)
+
+            return combined
+
+    def _model(self, images, is_training, reuse=False):
+        """Compute the logits given the images."""
+
+        # TODO(ahundt) this line doesn't seem correct, because if doing eval
+        # with fixed arcs, training should definitely be false
+        # if self.fixed_arc is None:
+        #     is_training = True
+
+        with tf.variable_scope(self.name, reuse=reuse):
+            # Conv for 2 separate stacking images
+            if self.dataset == "stacking" and self.use_root is True:
+                # input_channels_1 = self._get_C(images[0])
+                # input_channels_2 = self._get_C(images[1])
+                with tf.variable_scope("init_root"):
+                    w_1 = create_weight(
+                        "w_1", [3, 3, 3, 64])
+                    x_1 = tf.nn.conv2d(
+                        images[:, :, :, :3], w_1, [1, 1, 1, 1], "SAME")
+                    x_1 = norm(x_1, is_training=is_training, data_format=self.data_format, norm_type="batch", name="x_1_norm")
+                    x_1 = tf.nn.elu(x_1, name='elu_x_1')
+                    w_2 = create_weight(
+                        "w_2", [3, 3, 3, 64])
+                    x_2 = tf.nn.conv2d(
+                        images[:, :, :, 3:6], w_2, [1, 1, 1, 1], "SAME")
+                    x_2 = norm(x_2, is_training=is_training, data_format=self.data_format, norm_type="batch", name="x_2_norm")
+                    x_2 = tf.nn.elu(x_2, name='elu_x_2')
+                    x_3 = tf.layers.dense(images[:, :, :, 6:], units=2048, activation=tf.nn.relu)
+                    # NOTE: tf.nn.dropout takes keep_prob, so 0.25 here keeps
+                    # only 25% of the activations (drops 75%)
+                    x_3 = tf.nn.dropout(x_3, 0.25)
+                    # x_3 = tf.layers.dense(x_3, units=64, activation=tf.nn.relu)
+
+                    # dense_layer
+                    # tiling of images
+                    print("shape of x_1:", x_1.shape)
+                    image = [x_1, x_2]
+                    print("number of root image branches:", len(image))
+                    x = tf.concat([x_1, x_2, x_3], axis=-1)
+                    print("shape after concat:", x.shape)
+
+            # the first two inputs
+            if self.dataset == "stacking" and self.use_root is True:
+                input_channels = self._get_C(x)
+            else:
-            x = self._fixed_layer(
-              layer_id, layers, self.normal_arc, out_filters, 1, is_training,
-              normal_or_reduction_cell="normal")
-          else:
-            out_filters *= 2
-            if self.fixed_arc is None:
-              x = self._factorized_reduction(x, out_filters, 2, is_training)
-              layers = [layers[-1], x]
-              x = self._enas_layer(
-                layer_id, layers, self.reduce_arc, out_filters)
+                input_channels = self._get_C(images)
+            print("stem conv input channels:", input_channels)
+            with tf.variable_scope("stem_conv"):
+                w = create_weight(
+                    "w", [3, 3, input_channels,
+                          self.out_filters * 3])
+                if self.use_root is True:
+                    x = tf.nn.conv2d(
+                        x, w, [1, 1, 1, 1], "SAME",
+                        data_format=self.data_format)
+                else:
+                    x = tf.nn.conv2d(
+                        images, w, [1, 1, 1, 1], "SAME",
+                        data_format=self.data_format)
+                x = norm(x, is_training=is_training, data_format=self.data_format, norm_type="batch")
+            if self.data_format == "NHWC":
+                split_axis = 3
+            elif self.data_format == "NCHW":
+                split_axis = 1
+            else:
-          else:
-            x = self._fixed_layer(
-              layer_id, layers, self.reduce_arc, out_filters, 2, is_training,
-              normal_or_reduction_cell="reduction")
-        print("Layer {0:>2d}: {1}".format(layer_id, x))
-        layers = [layers[-1], x]
-
-        # auxiliary heads
-        self.num_aux_vars = 0
-        if (self.use_aux_heads and
-            layer_id in self.aux_head_indices
-            and is_training):
-          print("Using aux_head at layer {0}".format(layer_id))
-          with tf.variable_scope("aux_head"):
-            aux_logits = tf.nn.relu(x)
-            aux_logits = tf.layers.average_pooling2d(
-              aux_logits, [5, 5], [3, 3], "VALID",
-              data_format=self.actual_data_format)
-            with tf.variable_scope("proj"):
-              inp_c =
self._get_C(aux_logits) - w = create_weight("w", [1, 1, inp_c, 128]) - aux_logits = tf.nn.conv2d(aux_logits, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - aux_logits = batch_norm(aux_logits, is_training=True, - data_format=self.data_format) - aux_logits = tf.nn.relu(aux_logits) - - with tf.variable_scope("avg_pool"): - inp_c = self._get_C(aux_logits) - hw = self._get_HW(aux_logits) - w = create_weight("w", [hw, hw, inp_c, 768]) - aux_logits = tf.nn.conv2d(aux_logits, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - aux_logits = batch_norm(aux_logits, is_training=True, - data_format=self.data_format) - aux_logits = tf.nn.relu(aux_logits) - + raise ValueError( + "Unknown data_format '{0}'".format(self.data_format)) + layers = [x, x] + + # building layers in the micro space + out_filters = self.out_filters + for layer_id in range(self.num_layers + 2): + with tf.variable_scope("layer_{0}".format(layer_id)): + if layer_id not in self.pool_layers: + if self.fixed_arc is None: + x = self._enas_layer( + layer_id, layers, self.normal_arc, out_filters, + is_training=is_training) + else: + x = self._fixed_layer( + layer_id, layers, self.normal_arc, out_filters, + 1, is_training=is_training, + normal_or_reduction_cell="normal") + else: + out_filters *= 2 + if self.fixed_arc is None: + x = self._factorized_reduction( + x, out_filters, 2, is_training) + layers = [layers[-1], x] + x = self._enas_layer( + layer_id, layers, self.reduce_arc, out_filters, + is_training=is_training) + else: + x = self._fixed_layer( + layer_id, layers, self.reduce_arc, out_filters, + 2, is_training=is_training, + normal_or_reduction_cell="reduction") + print("Layer {0:>2d}: {1}".format(layer_id, x)) + layers = [layers[-1], x] + + # auxiliary heads + self.num_aux_vars = 0 + if (self.use_aux_heads and + layer_id in self.aux_head_indices + and is_training): + print("Using aux_head at layer {0}".format(layer_id)) + with tf.variable_scope("aux_head"): + aux_logits = tf.nn.elu(x) + aux_logits = tf.layers.average_pooling2d( + aux_logits, [5, 5], [3, 3], "VALID", + data_format=self.actual_data_format) + with tf.variable_scope("proj"): + inp_c = self._get_C(aux_logits) + w = create_weight("w", [1, 1, inp_c, 128]) + aux_logits = tf.nn.conv2d(aux_logits, w, + [1, 1, 1, 1], "SAME", + data_format=self.data_format) + aux_logits = norm(aux_logits, + is_training=is_training, + data_format=self.data_format, norm_type="batch") + aux_logits = tf.nn.elu(aux_logits) + + with tf.variable_scope("avg_pool"): + inp_c = self._get_C(aux_logits) + hw = self._get_HW(aux_logits) + w = create_weight("w", [hw, hw, inp_c, 768]) + aux_logits = tf.nn.conv2d(aux_logits, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + aux_logits = norm(aux_logits, is_training=is_training, + data_format=self.data_format, norm_type="batch") + aux_logits = tf.nn.elu(aux_logits) + + with tf.variable_scope("fc"): + aux_logits = global_max_pool(aux_logits, + data_format=self.data_format) + inp_c = aux_logits.get_shape()[1].value + w = create_weight("w", [inp_c, self.num_classes]) + aux_logits = tf.matmul(aux_logits, w) + self.aux_logits = aux_logits + + aux_head_variables = [ + var for var in tf.trainable_variables() if ( + var.name.startswith(self.name) and "aux_head" in var.name)] + self.num_aux_vars = count_model_params(aux_head_variables) + print("Aux head uses {0} params".format(self.num_aux_vars)) + + x = tf.nn.elu(x) + x = global_max_pool(x, data_format=self.data_format) + if is_training and self.keep_prob is not None and self.keep_prob < 1.0: + 
x = tf.nn.dropout(x, self.keep_prob) with tf.variable_scope("fc"): - aux_logits = global_avg_pool(aux_logits, - data_format=self.data_format) - inp_c = aux_logits.get_shape()[1].value - w = create_weight("w", [inp_c, 10]) - aux_logits = tf.matmul(aux_logits, w) - self.aux_logits = aux_logits - - aux_head_variables = [ - var for var in tf.trainable_variables() if ( - var.name.startswith(self.name) and "aux_head" in var.name)] - self.num_aux_vars = count_model_params(aux_head_variables) - print("Aux head uses {0} params".format(self.num_aux_vars)) - - x = tf.nn.relu(x) - x = global_avg_pool(x, data_format=self.data_format) - if is_training and self.keep_prob is not None and self.keep_prob < 1.0: - x = tf.nn.dropout(x, self.keep_prob) - with tf.variable_scope("fc"): - inp_c = x.get_shape()[1] - w = create_weight("w", [inp_c, 10]) - x = tf.matmul(x, w) - return x - - def _fixed_conv(self, x, f_size, out_filters, stride, is_training, - stack_convs=2): - """Apply fixed convolution. - - Args: - stacked_convs: number of separable convs to apply. - """ - - for conv_id in range(stack_convs): - inp_c = self._get_C(x) - if conv_id == 0: - strides = self._get_strides(stride) - else: - strides = [1, 1, 1, 1] - - with tf.variable_scope("sep_conv_{}".format(conv_id)): - w_depthwise = create_weight("w_depth", [f_size, f_size, inp_c, 1]) - w_pointwise = create_weight("w_point", [1, 1, inp_c, out_filters]) - x = tf.nn.relu(x) - x = tf.nn.separable_conv2d( - x, - depthwise_filter=w_depthwise, - pointwise_filter=w_pointwise, - strides=strides, padding="SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) - - return x - - def _fixed_combine(self, layers, used, out_filters, is_training, - normal_or_reduction_cell="normal"): - """Adjust if necessary. - - Args: - layers: a list of tf tensors of size [NHWC] of [NCHW]. - used: a numpy tensor, [0] means not used. - """ - - out_hw = min([self._get_HW(layer) - for i, layer in enumerate(layers) if used[i] == 0]) - out = [] - - with tf.variable_scope("final_combine"): - for i, layer in enumerate(layers): - if used[i] == 0: - hw = self._get_HW(layer) - if hw > out_hw: - assert hw == out_hw * 2, ("i_hw={0} != {1}=o_hw".format(hw, out_hw)) - with tf.variable_scope("calibrate_{0}".format(i)): - x = self._factorized_reduction(layer, out_filters, 2, is_training) - else: - x = layer - out.append(x) - - if self.data_format == "NHWC": - out = tf.concat(out, axis=3) - elif self.data_format == "NCHW": - out = tf.concat(out, axis=1) - else: - raise ValueError("Unknown data_format '{0}'".format(self.data_format)) - - return out - - def _fixed_layer(self, layer_id, prev_layers, arc, out_filters, stride, - is_training, normal_or_reduction_cell="normal"): - """ - Args: - prev_layers: cache of previous layers. 
for skip connections - is_training: for batch_norm - """ - - assert len(prev_layers) == 2 - layers = [prev_layers[0], prev_layers[1]] - layers = self._maybe_calibrate_size(layers, out_filters, - is_training=is_training) - - with tf.variable_scope("layer_base"): - x = layers[1] - inp_c = self._get_C(x) - w = create_weight("w", [1, 1, inp_c, out_filters]) - x = tf.nn.relu(x) - x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) - layers[1] = x - - used = np.zeros([self.num_cells + 2], dtype=np.int32) - f_sizes = [3, 5] - for cell_id in range(self.num_cells): - with tf.variable_scope("cell_{}".format(cell_id)): - x_id = arc[4 * cell_id] - used[x_id] += 1 - x_op = arc[4 * cell_id + 1] - x = layers[x_id] - x_stride = stride if x_id in [0, 1] else 1 - with tf.variable_scope("x_conv"): - if x_op in [0, 1]: - f_size = f_sizes[x_op] - x = self._fixed_conv(x, f_size, out_filters, x_stride, is_training) - elif x_op in [2, 3]: + inp_c = x.get_shape()[1] + # print("inp_c--------------",inp_c) + # print("shape x model --------------", x.shape) + w = create_weight("w", [inp_c, self.num_classes]) + x = tf.matmul(x, w) + return x + + def _fixed_conv(self, x, f_size, out_filters, stride, is_training, + stack_convs=2): + """Apply fixed convolution. + + Args: + stacked_convs: number of separable convs to apply. + """ + + for conv_id in range(stack_convs): inp_c = self._get_C(x) - if x_op == 2: - x = tf.layers.average_pooling2d( - x, [3, 3], [x_stride, x_stride], "SAME", - data_format=self.actual_data_format) + if conv_id == 0: + strides = self._get_strides(stride) else: - x = tf.layers.max_pooling2d( - x, [3, 3], [x_stride, x_stride], "SAME", - data_format=self.actual_data_format) - if inp_c != out_filters: - w = create_weight("w", [1, 1, inp_c, out_filters]) - x = tf.nn.relu(x) - x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) - else: - inp_c = self._get_C(x) - if x_stride > 1: - assert x_stride == 2 - x = self._factorized_reduction(x, out_filters, 2, is_training) - if inp_c != out_filters: - w = create_weight("w", [1, 1, inp_c, out_filters]) - x = tf.nn.relu(x) - x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) - x = batch_norm(x, is_training, data_format=self.data_format) - if (x_op in [0, 1, 2, 3] and - self.drop_path_keep_prob is not None and - is_training): - x = self._apply_drop_path(x, layer_id) - - y_id = arc[4 * cell_id + 2] - used[y_id] += 1 - y_op = arc[4 * cell_id + 3] - y = layers[y_id] - y_stride = stride if y_id in [0, 1] else 1 - with tf.variable_scope("y_conv"): - if y_op in [0, 1]: - f_size = f_sizes[y_op] - y = self._fixed_conv(y, f_size, out_filters, y_stride, is_training) - elif y_op in [2, 3]: - inp_c = self._get_C(y) - if y_op == 2: - y = tf.layers.average_pooling2d( - y, [3, 3], [y_stride, y_stride], "SAME", - data_format=self.actual_data_format) + strides = [1, 1, 1, 1] + + with tf.variable_scope("sep_conv_{}".format(conv_id)): + w_depthwise = create_weight( + "w_depth", [f_size, f_size, inp_c, 1]) + w_pointwise = create_weight( + "w_point", [1, 1, inp_c, out_filters]) + x = tf.nn.elu(x) + x = tf.nn.separable_conv2d( + x, + depthwise_filter=w_depthwise, + pointwise_filter=w_pointwise, + strides=strides, padding="SAME", data_format=self.data_format) + x = norm(x, is_training=is_training, data_format=self.data_format, norm_type="batch") + + return x + + def _fixed_combine(self, 
layers, used, out_filters, is_training, + normal_or_reduction_cell="normal"): + """Adjust if necessary. + + Args: + layers: a list of tf tensors of size [NHWC] of [NCHW]. + used: a numpy tensor, [0] means not used. + """ + + out_hw = min([self._get_HW(layer) + for i, layer in enumerate(layers) if used[i] == 0]) + out = [] + + with tf.variable_scope("final_combine"): + for i, layer in enumerate(layers): + if used[i] == 0: + hw = self._get_HW(layer) + if hw > out_hw: + assert hw == out_hw * \ + 2, ("i_hw={0} != {1}=o_hw".format(hw, out_hw)) + with tf.variable_scope("calibrate_{0}".format(i)): + x = self._factorized_reduction( + layer, out_filters, 2, is_training) + else: + x = layer + out.append(x) + + if self.data_format == "NHWC": + out = tf.concat(out, axis=3) + elif self.data_format == "NCHW": + out = tf.concat(out, axis=1) else: - y = tf.layers.max_pooling2d( - y, [3, 3], [y_stride, y_stride], "SAME", - data_format=self.actual_data_format) - if inp_c != out_filters: - w = create_weight("w", [1, 1, inp_c, out_filters]) - y = tf.nn.relu(y) - y = tf.nn.conv2d(y, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - y = batch_norm(y, is_training, data_format=self.data_format) - else: - inp_c = self._get_C(y) - if y_stride > 1: - assert y_stride == 2 - y = self._factorized_reduction(y, out_filters, 2, is_training) - if inp_c != out_filters: - w = create_weight("w", [1, 1, inp_c, out_filters]) - y = tf.nn.relu(y) - y = tf.nn.conv2d(y, w, [1, 1, 1, 1], "SAME", - data_format=self.data_format) - y = batch_norm(y, is_training, data_format=self.data_format) - - if (y_op in [0, 1, 2, 3] and - self.drop_path_keep_prob is not None and - is_training): - y = self._apply_drop_path(y, layer_id) - - out = x + y - layers.append(out) - out = self._fixed_combine(layers, used, out_filters, is_training, - normal_or_reduction_cell) - - return out - - def _enas_cell(self, x, curr_cell, prev_cell, op_id, out_filters): - """Performs an enas operation specified by op_id.""" - - num_possible_inputs = curr_cell + 1 - - with tf.variable_scope("avg_pool"): - avg_pool = tf.layers.average_pooling2d( - x, [3, 3], [1, 1], "SAME", data_format=self.actual_data_format) - avg_pool_c = self._get_C(avg_pool) - if avg_pool_c != out_filters: - with tf.variable_scope("conv"): - w = create_weight( - "w", [num_possible_inputs, avg_pool_c * out_filters]) - w = w[prev_cell] - w = tf.reshape(w, [1, 1, avg_pool_c, out_filters]) - avg_pool = tf.nn.relu(avg_pool) - avg_pool = tf.nn.conv2d(avg_pool, w, strides=[1, 1, 1, 1], - padding="SAME", data_format=self.data_format) - avg_pool = batch_norm(avg_pool, is_training=True, - data_format=self.data_format) - - with tf.variable_scope("max_pool"): - max_pool = tf.layers.max_pooling2d( - x, [3, 3], [1, 1], "SAME", data_format=self.actual_data_format) - max_pool_c = self._get_C(max_pool) - if max_pool_c != out_filters: - with tf.variable_scope("conv"): - w = create_weight( - "w", [num_possible_inputs, max_pool_c * out_filters]) - w = w[prev_cell] - w = tf.reshape(w, [1, 1, max_pool_c, out_filters]) - max_pool = tf.nn.relu(max_pool) - max_pool = tf.nn.conv2d(max_pool, w, strides=[1, 1, 1, 1], - padding="SAME", data_format=self.data_format) - max_pool = batch_norm(max_pool, is_training=True, - data_format=self.data_format) - - x_c = self._get_C(x) - if x_c != out_filters: - with tf.variable_scope("x_conv"): - w = create_weight("w", [num_possible_inputs, x_c * out_filters]) - w = w[prev_cell] - w = tf.reshape(w, [1, 1, x_c, out_filters]) - x = tf.nn.relu(x) - x = tf.nn.conv2d(x, w, strides=[1, 
1, 1, 1], padding="SAME", - data_format=self.data_format) - x = batch_norm(x, is_training=True, data_format=self.data_format) - - out = [ - self._enas_conv(x, curr_cell, prev_cell, 3, out_filters), - self._enas_conv(x, curr_cell, prev_cell, 5, out_filters), - avg_pool, - max_pool, - x, - ] - - out = tf.stack(out, axis=0) - out = out[op_id, :, :, :, :] - return out - - def _enas_conv(self, x, curr_cell, prev_cell, filter_size, out_filters, - stack_conv=2): - """Performs an enas convolution specified by the relevant parameters.""" - - with tf.variable_scope("conv_{0}x{0}".format(filter_size)): - num_possible_inputs = curr_cell + 2 - for conv_id in range(stack_conv): - with tf.variable_scope("stack_{0}".format(conv_id)): - # create params and pick the correct path - inp_c = self._get_C(x) - w_depthwise = create_weight( - "w_depth", [num_possible_inputs, filter_size * filter_size * inp_c]) - w_depthwise = w_depthwise[prev_cell, :] - w_depthwise = tf.reshape( - w_depthwise, [filter_size, filter_size, inp_c, 1]) - - w_pointwise = create_weight( - "w_point", [num_possible_inputs, inp_c * out_filters]) - w_pointwise = w_pointwise[prev_cell, :] - w_pointwise = tf.reshape(w_pointwise, [1, 1, inp_c, out_filters]) - - with tf.variable_scope("bn"): - zero_init = tf.initializers.zeros(dtype=tf.float32) - one_init = tf.initializers.ones(dtype=tf.float32) - offset = create_weight( - "offset", [num_possible_inputs, out_filters], - initializer=zero_init) - scale = create_weight( - "scale", [num_possible_inputs, out_filters], - initializer=one_init) - offset = offset[prev_cell] - scale = scale[prev_cell] - - # the computations - x = tf.nn.relu(x) - x = tf.nn.separable_conv2d( + raise ValueError( + "Unknown data_format '{0}'".format(self.data_format)) + + return out + + def _fixed_layer(self, layer_id, prev_layers, arc, out_filters, stride, + is_training, normal_or_reduction_cell="normal"): + """ + Args: + prev_layers: cache of previous layers. 
for skip connections + is_training: for batch_norm + """ + + assert len(prev_layers) == 2 + layers = [prev_layers[0], prev_layers[1]] + layers = self._maybe_calibrate_size(layers, out_filters, + is_training=is_training) + + with tf.variable_scope("layer_base"): + x = layers[1] + inp_c = self._get_C(x) + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.elu(x) + x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + x = norm(x, is_training=is_training, data_format=self.data_format, norm_type="batch") + layers[1] = x + + used = np.zeros([self.num_cells + 2], dtype=np.int32) + f_sizes = [3, 5] + for cell_id in range(self.num_cells): + with tf.variable_scope("cell_{}".format(cell_id)): + x_id = arc[4 * cell_id] + used[x_id] += 1 + x_op = arc[4 * cell_id + 1] + x = layers[x_id] + x_stride = stride if x_id in [0, 1] else 1 + with tf.variable_scope("x_conv"): + if x_op in [0, 1]: + f_size = f_sizes[x_op] + x = self._fixed_conv( + x, f_size, out_filters, x_stride, is_training) + elif x_op in [2, 3]: + inp_c = self._get_C(x) + if x_op == 2: + x = tf.layers.average_pooling2d( + x, [3, 3], [x_stride, x_stride], "SAME", + data_format=self.actual_data_format) + else: + x = tf.layers.max_pooling2d( + x, [3, 3], [x_stride, x_stride], "SAME", + data_format=self.actual_data_format) + if inp_c != out_filters: + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.elu(x) + x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + x = norm( + x, is_training=is_training, data_format=self.data_format, norm_type="batch") + else: + inp_c = self._get_C(x) + if x_stride > 1: + assert x_stride == 2 + x = self._factorized_reduction( + x, out_filters, 2, is_training) + if inp_c != out_filters: + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.elu(x) + x = tf.nn.conv2d( + x, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) + x = norm( + x, is_training=is_training, data_format=self.data_format, norm_type="batch") + if (x_op in [0, 1, 2, 3] and + self.drop_path_keep_prob is not None and + is_training): + x = self._apply_drop_path(x, layer_id) + + y_id = arc[4 * cell_id + 2] + used[y_id] += 1 + y_op = arc[4 * cell_id + 3] + y = layers[y_id] + y_stride = stride if y_id in [0, 1] else 1 + with tf.variable_scope("y_conv"): + if y_op in [0, 1]: + f_size = f_sizes[y_op] + y = self._fixed_conv( + y, f_size, out_filters, y_stride, is_training) + elif y_op in [2, 3]: + inp_c = self._get_C(y) + if y_op == 2: + y = tf.layers.average_pooling2d( + y, [3, 3], [y_stride, y_stride], "SAME", + data_format=self.actual_data_format) + else: + y = tf.layers.max_pooling2d( + y, [3, 3], [y_stride, y_stride], "SAME", + data_format=self.actual_data_format) + if inp_c != out_filters: + w = create_weight("w", [1, 1, inp_c, out_filters]) + y = tf.nn.elu(y) + y = tf.nn.conv2d(y, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + y = norm( + y, is_training=is_training, data_format=self.data_format, norm_type="batch") + else: + inp_c = self._get_C(y) + if y_stride > 1: + assert y_stride == 2 + y = self._factorized_reduction( + y, out_filters, 2, is_training) + if inp_c != out_filters: + w = create_weight("w", [1, 1, inp_c, out_filters]) + y = tf.nn.elu(y) + y = tf.nn.conv2d(y, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + y = norm( + y, is_training=is_training, data_format=self.data_format, norm_type="batch") + + if (y_op in [0, 1, 2, 3] and + self.drop_path_keep_prob is not None and + is_training): + y = self._apply_drop_path(y, layer_id) + + 
out = x + y + layers.append(out) + out = self._fixed_combine(layers, used, out_filters, is_training=is_training, + normal_or_reduction_cell=normal_or_reduction_cell) + + return out + + def _enas_cell(self, x, curr_cell, prev_cell, op_id, out_filters, is_training): + """Performs an enas operation specified by op_id.""" + + num_possible_inputs = curr_cell + 1 + + with tf.variable_scope("avg_pool"): + avg_pool = tf.layers.average_pooling2d( + x, [3, 3], [1, 1], "SAME", data_format=self.actual_data_format) + avg_pool_c = self._get_C(avg_pool) + if avg_pool_c != out_filters: + with tf.variable_scope("conv"): + w = create_weight( + "w", [num_possible_inputs, avg_pool_c * out_filters]) + w = w[prev_cell] + w = tf.reshape(w, [1, 1, avg_pool_c, out_filters]) + avg_pool = tf.nn.elu(avg_pool) + avg_pool = tf.nn.conv2d(avg_pool, w, strides=[1, 1, 1, 1], + padding="SAME", data_format=self.data_format) + avg_pool = norm(avg_pool, is_training=is_training, + data_format=self.data_format, norm_type="batch") + + with tf.variable_scope("max_pool"): + max_pool = tf.layers.max_pooling2d( + x, [3, 3], [1, 1], "SAME", data_format=self.actual_data_format) + max_pool_c = self._get_C(max_pool) + if max_pool_c != out_filters: + with tf.variable_scope("conv"): + w = create_weight( + "w", [num_possible_inputs, max_pool_c * out_filters]) + w = w[prev_cell] + w = tf.reshape(w, [1, 1, max_pool_c, out_filters]) + max_pool = tf.nn.elu(max_pool) + max_pool = tf.nn.conv2d(max_pool, w, strides=[1, 1, 1, 1], + padding="SAME", data_format=self.data_format) + max_pool = norm(max_pool, is_training=is_training, + data_format=self.data_format, norm_type="batch") + + x_c = self._get_C(x) + if x_c != out_filters: + with tf.variable_scope("x_conv"): + w = create_weight( + "w", [num_possible_inputs, x_c * out_filters]) + w = w[prev_cell] + w = tf.reshape(w, [1, 1, x_c, out_filters]) + x = tf.nn.elu(x) + x = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME", + data_format=self.data_format) + x = norm(x, is_training=is_training, + data_format=self.data_format, norm_type="batch") + + out = [ + self._enas_conv(x, curr_cell, prev_cell, 3, out_filters, is_training=is_training), + self._enas_conv(x, curr_cell, prev_cell, 5, out_filters, is_training=is_training), + avg_pool, + max_pool, x, - depthwise_filter=w_depthwise, - pointwise_filter=w_pointwise, - strides=[1, 1, 1, 1], padding="SAME", - data_format=self.data_format) - x, _, _ = tf.nn.fused_batch_norm( - x, scale, offset, epsilon=1e-5, data_format=self.data_format, - is_training=True) - return x - - def _enas_layer(self, layer_id, prev_layers, arc, out_filters): - """ - Args: - layer_id: current layer - prev_layers: cache of previous layers. for skip connections - start_idx: where to start looking at. technically, we can infer this - from layer_id, but why bother... 
- """ - - assert len(prev_layers) == 2, "need exactly 2 inputs" - layers = [prev_layers[0], prev_layers[1]] - layers = self._maybe_calibrate_size(layers, out_filters, is_training=True) - used = [] - for cell_id in range(self.num_cells): - prev_layers = tf.stack(layers, axis=0) - with tf.variable_scope("cell_{0}".format(cell_id)): - with tf.variable_scope("x"): - x_id = arc[4 * cell_id] - x_op = arc[4 * cell_id + 1] - x = prev_layers[x_id, :, :, :, :] - x = self._enas_cell(x, cell_id, x_id, x_op, out_filters) - x_used = tf.one_hot(x_id, depth=self.num_cells + 2, dtype=tf.int32) - - with tf.variable_scope("y"): - y_id = arc[4 * cell_id + 2] - y_op = arc[4 * cell_id + 3] - y = prev_layers[y_id, :, :, :, :] - y = self._enas_cell(y, cell_id, y_id, y_op, out_filters) - y_used = tf.one_hot(y_id, depth=self.num_cells + 2, dtype=tf.int32) - - out = x + y - used.extend([x_used, y_used]) - layers.append(out) - - used = tf.add_n(used) - indices = tf.where(tf.equal(used, 0)) - indices = tf.to_int32(indices) - indices = tf.reshape(indices, [-1]) - num_outs = tf.size(indices) - out = tf.stack(layers, axis=0) - out = tf.gather(out, indices, axis=0) - - inp = prev_layers[0] - if self.data_format == "NHWC": - N = tf.shape(inp)[0] - H = tf.shape(inp)[1] - W = tf.shape(inp)[2] - C = tf.shape(inp)[3] - out = tf.transpose(out, [1, 2, 3, 0, 4]) - out = tf.reshape(out, [N, H, W, num_outs * out_filters]) - elif self.data_format == "NCHW": - N = tf.shape(inp)[0] - C = tf.shape(inp)[1] - H = tf.shape(inp)[2] - W = tf.shape(inp)[3] - out = tf.transpose(out, [1, 0, 2, 3, 4]) - out = tf.reshape(out, [N, num_outs * out_filters, H, W]) - else: - raise ValueError("Unknown data_format '{0}'".format(self.data_format)) - - with tf.variable_scope("final_conv"): - w = create_weight("w", [self.num_cells + 2, out_filters * out_filters]) - w = tf.gather(w, indices, axis=0) - w = tf.reshape(w, [1, 1, num_outs * out_filters, out_filters]) - out = tf.nn.relu(out) - out = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="SAME", - data_format=self.data_format) - out = batch_norm(out, is_training=True, data_format=self.data_format) - - out = tf.reshape(out, tf.shape(prev_layers[0])) - - return out - - # override - def _build_train(self): - print("-" * 80) - print("Build train graph") - logits = self._model(self.x_train, is_training=True) - log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=self.y_train) - self.loss = tf.reduce_mean(log_probs) - - if self.use_aux_heads: - log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=self.aux_logits, labels=self.y_train) - self.aux_loss = tf.reduce_mean(log_probs) - train_loss = self.loss + 0.4 * self.aux_loss - else: - train_loss = self.loss - - self.train_preds = tf.argmax(logits, axis=1) - self.train_preds = tf.to_int32(self.train_preds) - self.train_acc = tf.equal(self.train_preds, self.y_train) - self.train_acc = tf.to_int32(self.train_acc) - self.train_acc = tf.reduce_sum(self.train_acc) - - tf_variables = [ - var for var in tf.trainable_variables() if ( - var.name.startswith(self.name) and "aux_head" not in var.name)] - self.num_vars = count_model_params(tf_variables) - print("Model has {0} params".format(self.num_vars)) - - self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( - train_loss, - tf_variables, - self.global_step, - clip_mode=self.clip_mode, - grad_bound=self.grad_bound, - l2_reg=self.l2_reg, - lr_init=self.lr_init, - lr_dec_start=self.lr_dec_start, - lr_dec_every=self.lr_dec_every, - 
lr_dec_rate=self.lr_dec_rate, - lr_cosine=self.lr_cosine, - lr_max=self.lr_max, - lr_min=self.lr_min, - lr_T_0=self.lr_T_0, - lr_T_mul=self.lr_T_mul, - num_train_batches=self.num_train_batches, - optim_algo=self.optim_algo, - sync_replicas=self.sync_replicas, - num_aggregate=self.num_aggregate, - num_replicas=self.num_replicas) - - # override - def _build_valid(self): - if self.x_valid is not None: - print("-" * 80) - print("Build valid graph") - logits = self._model(self.x_valid, False, reuse=True) - self.valid_preds = tf.argmax(logits, axis=1) - self.valid_preds = tf.to_int32(self.valid_preds) - self.valid_acc = tf.equal(self.valid_preds, self.y_valid) - self.valid_acc = tf.to_int32(self.valid_acc) - self.valid_acc = tf.reduce_sum(self.valid_acc) - - # override - def _build_test(self): - print("-" * 80) - print("Build test graph") - logits = self._model(self.x_test, False, reuse=True) - self.test_preds = tf.argmax(logits, axis=1) - self.test_preds = tf.to_int32(self.test_preds) - self.test_acc = tf.equal(self.test_preds, self.y_test) - self.test_acc = tf.to_int32(self.test_acc) - self.test_acc = tf.reduce_sum(self.test_acc) - - # override - def build_valid_rl(self, shuffle=False): - print("-" * 80) - print("Build valid graph on shuffled data") - with tf.device("/cpu:0"): - # shuffled valid data: for choosing validation model - if not shuffle and self.data_format == "NCHW": - self.images["valid_original"] = np.transpose( - self.images["valid_original"], [0, 3, 1, 2]) - x_valid_shuffle, y_valid_shuffle = tf.train.shuffle_batch( - [self.images["valid_original"], self.labels["valid_original"]], - batch_size=self.batch_size, - capacity=25000, - enqueue_many=True, - min_after_dequeue=0, - num_threads=16, - seed=self.seed, - allow_smaller_final_batch=True, - ) - - def _pre_process(x): - x = tf.pad(x, [[4, 4], [4, 4], [0, 0]]) - x = tf.random_crop(x, [32, 32, 3], seed=self.seed) - x = tf.image.random_flip_left_right(x, seed=self.seed) - if self.data_format == "NCHW": - x = tf.transpose(x, [2, 0, 1]) + ] + + out = tf.stack(out, axis=0) + if self.verbose > 0: + print('-' * 80) + shape_list = out.get_shape().as_list() + print('_enas_cell::cell op_id: ' + str(op_id) + ' out shape: ' + str(shape_list) + ' data_format: ' + str(self.data_format)) + for line in traceback.format_stack(): + print(line.strip()) + out = out[op_id, :, :, :, :] + return out + + def _enas_conv(self, x, curr_cell, prev_cell, filter_size, out_filters, is_training, + stack_conv=2, norm_type='group'): + """Performs an enas convolution specified by the relevant parameters.""" + + with tf.variable_scope("conv_{0}x{0}".format(filter_size)): + num_possible_inputs = curr_cell + 2 + for conv_id in range(stack_conv): + with tf.variable_scope("stack_{0}".format(conv_id)): + # create params and pick the correct path + inp_c = self._get_C(x) + w_depthwise = create_weight( + "w_depth", [num_possible_inputs, filter_size * filter_size * inp_c]) + w_depthwise = w_depthwise[prev_cell, :] + w_depthwise = tf.reshape( + w_depthwise, [filter_size, filter_size, inp_c, 1]) + + w_pointwise = create_weight( + "w_point", [num_possible_inputs, inp_c * out_filters]) + w_pointwise = w_pointwise[prev_cell, :] + w_pointwise = tf.reshape( + w_pointwise, [1, 1, inp_c, out_filters]) + + # the computations + x = tf.nn.elu(x) + x = tf.nn.separable_conv2d( + x, + depthwise_filter=w_depthwise, + pointwise_filter=w_pointwise, + strides=[1, 1, 1, 1], padding="SAME", + data_format=self.data_format) + x = norm(x, is_training=is_training, norm_type="batch") return x - if 
shuffle: - x_valid_shuffle = tf.map_fn( - _pre_process, x_valid_shuffle, back_prop=False) - - logits = self._model(x_valid_shuffle, is_training=True, reuse=True) - valid_shuffle_preds = tf.argmax(logits, axis=1) - valid_shuffle_preds = tf.to_int32(valid_shuffle_preds) - self.valid_shuffle_acc = tf.equal(valid_shuffle_preds, y_valid_shuffle) - self.valid_shuffle_acc = tf.to_int32(self.valid_shuffle_acc) - self.valid_shuffle_acc = tf.reduce_sum(self.valid_shuffle_acc) - - def connect_controller(self, controller_model): - if self.fixed_arc is None: - self.normal_arc, self.reduce_arc = controller_model.sample_arc - else: - fixed_arc = np.array([int(x) for x in self.fixed_arc.split(" ") if x]) - self.normal_arc = fixed_arc[:4 * self.num_cells] - self.reduce_arc = fixed_arc[4 * self.num_cells:] - - self._build_train() - self._build_valid() - self._build_test() + def _enas_layer(self, layer_id, prev_layers, arc, out_filters, is_training): + """ + Args: + layer_id: current layer + prev_layers: cache of previous layers. for skip connections + start_idx: where to start looking at. technically, we can infer this + from layer_id, but why bother... + """ + + assert len(prev_layers) == 2, "need exactly 2 inputs" + layers = [prev_layers[0], prev_layers[1]] + layers = self._maybe_calibrate_size( + layers, out_filters, is_training=is_training) + used = [] + for cell_id in range(self.num_cells): + prev_layers = tf.stack(layers, axis=0) + with tf.variable_scope("cell_{0}".format(cell_id)): + with tf.variable_scope("x"): + x_id = arc[4 * cell_id] + x_op = arc[4 * cell_id + 1] + x = prev_layers[x_id, :, :, :, :] + x = self._enas_cell(x, cell_id, x_id, x_op, out_filters, is_training=is_training) + x_used = tf.one_hot( + x_id, depth=self.num_cells + 2, dtype=tf.int32) + + with tf.variable_scope("y"): + y_id = arc[4 * cell_id + 2] + y_op = arc[4 * cell_id + 3] + y = prev_layers[y_id, :, :, :, :] + y = self._enas_cell(y, cell_id, y_id, y_op, out_filters, is_training=is_training) + y_used = tf.one_hot( + y_id, depth=self.num_cells + 2, dtype=tf.int32) + + out = x + y + used.extend([x_used, y_used]) + layers.append(out) + if self.verbose > 0: + print('-' * 80) + shape_list = out.get_shape().as_list() + print('_enas_layer::cell cell_id: ' + str(cell_id) + ' out shape: ' + str(shape_list) + ' data_format: ' + str(self.data_format)) + for line in traceback.format_stack(): + print(line.strip()) + + used = tf.add_n(used) + indices = tf.where(tf.equal(used, 0)) + indices = tf.to_int32(indices) + indices = tf.reshape(indices, [-1]) + num_outs = tf.size(indices) + out = tf.stack(layers, axis=0) + out = tf.gather(out, indices, axis=0) + + inp = prev_layers[0] + # get shape as an integer list, + # this is necessary to prevent some shape information being lost + # in the transpose/reshape below + inp_shape_list = inp.get_shape().as_list() + if self.verbose > 0: + print('-' * 80) + print('_enas_layer::inp tensor: ' + str(inp) + ' shape: ' + str(inp_shape_list) + ' data_format: ' + str(self.data_format)) + out_shape_list = out.get_shape().as_list() + print('_enas_layer::out tensor: ' + str(out) + ' shape: ' + str(out_shape_list) + ' data_format: ' + str(self.data_format)) + print('_enas_layer::num_outs: ' + str(num_outs) + ' _enas_layer::out_filters: ' + str(out_filters)) + for line in traceback.format_stack(): + print(line.strip()) + if self.data_format == "NHWC": + N = tf.shape(inp)[0] + H = inp_shape_list[1] + W = inp_shape_list[2] + C = inp_shape_list[3] + out = tf.transpose(out, [1, 2, 3, 0, 4]) + out = tf.reshape(out, [N, 
H, W, num_outs * out_filters]) + elif self.data_format == "NCHW": + N = tf.shape(inp)[0] + C = inp_shape_list[1] + H = inp_shape_list[2] + W = inp_shape_list[3] + out = tf.transpose(out, [1, 0, 2, 3, 4]) + out = tf.reshape(out, [N, num_outs * out_filters, H, W]) + else: + raise ValueError( + "Unknown data_format '{0}'".format(self.data_format)) + + with tf.variable_scope("final_conv"): + if self.verbose > 0: + print('-' * 80) + shape_list = out.get_shape().as_list() + print('_enas_layer::final_conv out shape: ' + str(shape_list) + ' data_format: ' + str(self.data_format)) + for line in traceback.format_stack(): + print(line.strip()) + w = create_weight( + "w", [self.num_cells + 2, out_filters * out_filters]) + w = tf.gather(w, indices, axis=0) + w = tf.reshape(w, [1, 1, num_outs * out_filters, out_filters]) + out = tf.nn.elu(out) + out = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="SAME", + data_format=self.data_format) + out = norm(out, is_training=is_training, + data_format=self.data_format, norm_type="batch") + + out = tf.reshape(out, tf.shape(prev_layers[0])) + + return out + + # override + def eval_once(self, sess, eval_set, feed_dict=None, verbose=False): + """Expects self.acc and self.global_step to be defined. + + Args: + sess: tf.Session() or one of its wrap arounds. + feed_dict: can be used to give more information to sess.run(). + eval_set: "valid" or "test" + """ + + assert self.global_step is not None + global_step = sess.run(self.global_step) + print("Eval {} set at {}".format(eval_set, global_step)) + + if eval_set == "valid": + assert self.x_valid is not None + assert self.valid_acc is not None + num_examples = self.num_valid_examples + num_batches = self.num_valid_batches + acc_op = self.valid_acc + acc_op_5mm_7_5deg = self.valid_acc_5mm_7_5deg + acc_op_1cm_15deg = self.valid_acc_1cm_15deg + acc_op_2_30 = self.valid_acc_2cm_30deg + acc_op_4_60 = self.valid_acc_4cm_60deg + acc_op_8_120 = self.valid_acc_8cm_120deg + acc_op_16cm_240deg = self.valid_acc_16cm_240deg + acc_op_32cm_360deg = self.valid_acc_32cm_360deg + loss_secondary_op = self.valid_loss_secondary + cart_op = self.valid_cart_error + ang_er_op = self.valid_angle_error + loss_op = self.valid_loss + mae_op = self.valid_mae + csvfile = self.output_dir + "/valid_metrics.csv" + elif eval_set == "test": + assert self.test_acc is not None + num_examples = self.num_test_examples + num_batches = self.num_test_batches + acc_op = self.test_acc + acc_op_5mm_7_5deg = self.test_acc_5mm_7_5deg + acc_op_1cm_15deg = self.test_acc_1cm_15deg + acc_op_2_30 = self.test_acc_2cm_30deg + acc_op_4_60 = self.test_acc_4cm_60deg + acc_op_8_120 = self.test_acc_8cm_120deg + acc_op_16cm_240deg = self.test_acc_16cm_240deg + acc_op_32cm_360deg = self.test_acc_32cm_360deg + loss_secondary_op = self.test_loss_secondary + ang_er_op = self.test_angle_error + cart_op = self.test_cart_error + loss_op = self.test_loss + mae_op = self.test_mae + csvfile = self.output_dir + "/test_metrics.csv" + else: + raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) + + total_acc = 0 + total_acc_5mm_7_5deg = 0 + total_acc_1cm_15deg = 0 + total_acc_2_30 = 0 + total_acc_4_60 = 0 + total_acc_8_120 = 0 + total_acc_16cm_240deg = 0 + total_acc_32cm_360deg = 0 + total_cart_error = 0 + total_mae = 0 + total_loss = 0 + total_exp = 0 + total_angle_error = 0 + total_loss_sec = 0 + normal_arc = [] + reduce_arc = [] + for batch_id in range(num_batches): + # if batch_id == 0: + # if feed_dict is None: + # feed_dict = {} + # # print the arc if we're on batch 
0
+            #     feed_dict['print_arc'] = self.print_arc
+            # elif batch_id == 1 and feed_dict is not None and 'print_arc' in feed_dict:
+            #     # remove the print arc tensor if we're on batch 1
+            #     feed_dict.pop('print_arc', None)
+            # the same metric ops are fetched whether the architecture is sampled
+            # (self.fixed_arc is None) or fixed, so a single sess.run covers both cases
+            acc, acc_5_7_5, acc_1_15, acc_2_30, acc_4_60, acc_8_120, acc_16_240, acc_32_360, cart_error, angle_error, mse, mae, loss_sec = sess.run(
+                [acc_op, acc_op_5mm_7_5deg, acc_op_1cm_15deg, acc_op_2_30, acc_op_4_60, acc_op_8_120, acc_op_16cm_240deg, acc_op_32cm_360deg, cart_op, ang_er_op, loss_op, mae_op, loss_secondary_op], feed_dict=feed_dict)
+            total_acc += acc
+            total_acc_5mm_7_5deg += acc_5_7_5
+            total_acc_1cm_15deg += acc_1_15
+            total_acc_2_30 += acc_2_30
+            total_acc_4_60 += acc_4_60
+            total_acc_8_120 += acc_8_120
+            total_acc_16cm_240deg += acc_16_240
+            total_acc_32cm_360deg += acc_32_360
+            total_cart_error += cart_error
+            total_angle_error += angle_error
+            total_loss += mse
+            total_mae += mae
+            total_loss_sec += loss_sec
+            total_exp += self.eval_batch_size
+            if verbose:
+                sys.stdout.write(
+                    "\r{:<5d}/{:>5d}".format(total_acc, total_exp))
+        if verbose:
+            print("")
+        print("{}_accuracy: {:<6.4f}".format(
+            eval_set, float(total_acc) / total_exp))
+        print("{}_accuracy_5mm_7_5deg: {:<6.4f}".format(
+            eval_set, float(total_acc_5mm_7_5deg) / total_exp))
+        print("{}_accuracy_1cm_15deg: {:<6.4f}".format(
+            eval_set, float(total_acc_1cm_15deg) / total_exp))
+        print("{}_accuracy_2cm_30deg: {:<6.4f}".format(
+            eval_set, float(total_acc_2_30) / total_exp))
+        print("{}_accuracy_4cm_60deg: {:<6.4f}".format(
+            eval_set, float(total_acc_4_60) / total_exp))
+        print("{}_accuracy_8cm_120deg: {:<6.4f}".format(
+            eval_set, float(total_acc_8_120) / total_exp))
+        print("{}_accuracy_16cm_240deg: {:<6.4f}".format(
+            eval_set, float(total_acc_16cm_240deg) / total_exp))
+        print("{}_accuracy_32cm_360deg: {:<6.4f}".format(
+            eval_set, float(total_acc_32cm_360deg) / total_exp))
+        if self.rotation_only is False and self.stacking_reward is False:
+            print("{}_cart_error: {:<6.4f}".format(
+                eval_set, float(total_cart_error) / num_batches))
+        if self.translation_only is False and self.stacking_reward is False:
+            print("{}_angle_error: {:<6.4f}".format(
+                eval_set, float(total_angle_error) / num_batches))
+        print("{}_loss_1: {:<6.4f}".format(
+            eval_set, float(total_loss) / num_batches))
+        print("{}_loss_2: {:<6.4f}".format(
+            eval_set, float(total_loss_sec) / num_batches))
+        print("{}_mae: {:<6.4f}".format(
+            eval_set, float(total_mae) / num_batches))
+        if self.fixed_arc is None:
+            print(eval_set, end=" ")
+            print('Eval Architecture:')
+            # print(np.reshape(normal_arc, [-1]))
+            # print(np.reshape(reduce_arc, [-1]))
+            # self.global_step = tf.Print(self.global_step, [self.normal_arc, self.reduce_arc], 'connect_controller(): [normal_arc, reduce_arc]: ', summarize=20)
+        if os.path.exists(csvfile):
+            file_mode = 'a'
+        else:
+            file_mode = 'w+'
+        with open(csvfile, file_mode) as fp:
+            fp.write("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n".format(
+                total_acc, total_acc_5mm_7_5deg, total_acc_1cm_15deg, total_acc_2_30, total_acc_4_60, total_acc_8_120, total_acc_16cm_240deg, total_acc_32cm_360deg, total_loss, total_mae, total_angle_error, total_cart_error, total_loss_sec))
+
+    # override
+    def _build_train(self):
+        print("-" * 80)
+        print("Build train graph")
+        # print("xtrshape-----------------------", self.x_train.shape)
+        logits = self._model(self.x_train, is_training=True)
+        # tf.Print(logits, [tf.shape(logits), "-----------log"])
+        # print("ytrshape-----------", self.y_train)
+        if self.dataset == "stacking":
+            log_probs = tf.nn.sigmoid(logits)
+            if self.use_msle is False:
+                self.loss = tf.losses.mean_squared_error(
+                    labels=self.y_train, predictions=log_probs)
+                self.loss_secondary = tf.reduce_mean(keras.losses.msle(
+                    self.y_train, log_probs))
+            else:
+                self.loss = tf.reduce_mean(keras.losses.msle(
+                    self.y_train, log_probs))
+                self.loss_secondary = tf.losses.mean_squared_error(
+                    labels=self.y_train, predictions=log_probs)
+        else:
+            activation_fn = tf.nn.sparse_softmax_cross_entropy_with_logits
+            log_probs = activation_fn(
+                logits=logits, labels=self.y_train)
+            self.loss = tf.reduce_mean(log_probs)
+
+        if self.use_aux_heads:
+            if self.dataset == "stacking":
+                # score the auxiliary head with the same sigmoid + mean squared
+                # error used for the primary head, applied to the aux logits
+                log_probs = tf.losses.mean_squared_error(
+                    labels=self.y_train, predictions=tf.nn.sigmoid(self.aux_logits))
+            else:
+                log_probs = activation_fn(
+                    logits=self.aux_logits, labels=self.y_train)
+            self.aux_loss = tf.reduce_mean(log_probs)
+            train_loss = self.loss + 0.4 * self.aux_loss
+        else:
+            train_loss = self.loss
+
+        if self.dataset == "stacking":
+            cast_type = tf.to_float
+        else:
+            cast_type = tf.to_int32
+
+        if self.dataset == "stacking":
+            self.train_preds = tf.nn.sigmoid(logits)
+            self.train_acc = grasp_metrics.grasp_acc(
+                self.y_train, self.train_preds)
+            self.train_acc = tf.reduce_mean(self.train_acc)
+
+            self.train_acc_5mm_7_5deg = grasp_metrics.grasp_acc_5mm_7_5deg(
+                self.y_train, self.train_preds)
+            self.train_acc_5mm_7_5deg = tf.reduce_mean(self.train_acc_5mm_7_5deg)
+
+            self.train_acc_1cm_15deg = grasp_metrics.grasp_acc_1cm_15deg(
+                self.y_train, self.train_preds)
+            self.train_acc_1cm_15deg = tf.reduce_mean(self.train_acc_1cm_15deg)
+
+            self.train_acc_2cm_30deg = grasp_metrics.grasp_acc_2cm_30deg(
+                self.y_train, self.train_preds)
+            self.train_acc_2cm_30deg = tf.reduce_mean(self.train_acc_2cm_30deg)
+
+            self.train_acc_4cm_60deg = grasp_metrics.grasp_acc_4cm_60deg(
+                self.y_train, self.train_preds)
+            self.train_acc_4cm_60deg = tf.reduce_mean(self.train_acc_4cm_60deg)
+
+            self.train_acc_8cm_120deg = grasp_metrics.grasp_acc_8cm_120deg(
+                self.y_train, self.train_preds)
+            self.train_acc_8cm_120deg = tf.reduce_mean(self.train_acc_8cm_120deg)
+
+            self.train_acc_16cm_240deg = grasp_metrics.grasp_acc_16cm_240deg(
+                self.y_train, self.train_preds)
+            self.train_acc_16cm_240deg = tf.reduce_mean(self.train_acc_16cm_240deg)
+
+            self.train_acc_32cm_360deg = grasp_metrics.grasp_acc_32cm_360deg(
+                self.y_train, self.train_preds)
+            self.train_acc_32cm_360deg = tf.reduce_mean(self.train_acc_32cm_360deg)
+
+            self.train_cart_error = grasp_metrics.cart_error(
+                self.y_train, self.train_preds)
+            if self.rotation_only is True or self.stacking_reward is True:
+                self.train_cart_error = tf.zeros([1])
+            else:
+                self.train_cart_error = tf.reduce_mean(self.train_cart_error)
+            if self.translation_only is True or self.stacking_reward is True:
+                self.train_angle_error = tf.zeros([1])
+            else:
+                self.train_angle_error = grasp_metrics.angle_error(
+                    self.y_train, self.train_preds)
+                self.train_angle_error = tf.reduce_mean(self.train_angle_error)
+            self.train_mae = 
tf.metrics.mean_absolute_error( + self.y_train, self.train_preds) + self.train_mae = tf.reduce_mean(self.train_mae) + + else: + self.train_preds = tf.argmax(logits, axis=1) + self.train_preds = cast_type(self.train_preds) + # tf.Print(self.train_preds,[tf.shape(self.train_preds),"trainpreds----"]) + # tf.Print(self.y_train,[tf.shape(self.y_train),"ytra==-------------"]) + self.train_acc = tf.equal(self.train_preds, self.y_train) + self.train_acc = cast_type(self.train_acc) + self.train_acc = tf.reduce_mean(self.train_acc) + self.train_cart_error = tf.zeros([1]) + self.train_angle_error = tf.zeros([1]) + self.train_mae = tf.zeros([1]) + + tf_variables = [ + var for var in tf.trainable_variables() if ( + var.name.startswith(self.name) and "aux_head" not in var.name)] + self.num_vars = count_model_params(tf_variables) + print("Model has {0} params".format(self.num_vars)) + + self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( + train_loss, + tf_variables, + self.global_step, + clip_mode=self.clip_mode, + grad_bound=self.grad_bound, + l2_reg=self.l2_reg, + lr_init=self.lr_init, + lr_dec_start=self.lr_dec_start, + lr_dec_every=self.lr_dec_every, + lr_dec_rate=self.lr_dec_rate, + lr_cosine=self.lr_cosine, + lr_max=self.lr_max, + lr_min=self.lr_min, + lr_T_0=self.lr_T_0, + lr_T_mul=self.lr_T_mul, + num_train_batches=self.num_train_batches, + optim_algo=self.optim_algo, + sync_replicas=self.sync_replicas, + num_aggregate=self.num_aggregate, + num_replicas=self.num_replicas) + + # override + def _build_valid(self): + if self.x_valid is not None: + print("-" * 80) + print("Build valid graph") + logits = self._model( + self.x_valid, is_training=True, reuse=True) + if self.dataset == "stacking": + logits = tf.nn.sigmoid(logits) + cast_type = tf.to_float + self.valid_preds = logits + self.valid_acc = grasp_metrics.grasp_acc( + self.y_valid, self.valid_preds) + self.valid_acc = tf.reduce_sum(self.valid_acc) + + self.valid_acc_5mm_7_5deg = grasp_metrics.grasp_acc_5mm_7_5deg( + self.y_valid, self.valid_preds) + self.valid_acc_5mm_7_5deg = tf.reduce_sum(self.valid_acc_5mm_7_5deg) + + self.valid_acc_1cm_15deg = grasp_metrics.grasp_acc_1cm_15deg( + self.y_valid, self.valid_preds) + self.valid_acc_1cm_15deg = tf.reduce_sum(self.valid_acc_1cm_15deg) + + self.valid_acc_2cm_30deg = grasp_metrics.grasp_acc_2cm_30deg( + self.y_valid, self.valid_preds) + self.valid_acc_2cm_30deg = tf.reduce_sum(self.valid_acc_2cm_30deg) + + self.valid_acc_4cm_60deg = grasp_metrics.grasp_acc_4cm_60deg( + self.y_valid, self.valid_preds) + self.valid_acc_4cm_60deg = tf.reduce_sum(self.valid_acc_4cm_60deg) + + self.valid_acc_8cm_120deg = grasp_metrics.grasp_acc_8cm_120deg( + self.y_valid, self.valid_preds) + self.valid_acc_8cm_120deg = tf.reduce_sum(self.valid_acc_8cm_120deg) + + self.valid_acc_16cm_240deg = grasp_metrics.grasp_acc_16cm_240deg( + self.y_valid, self.valid_preds) + self.valid_acc_16cm_240deg = tf.reduce_sum(self.valid_acc_16cm_240deg) + + self.valid_acc_32cm_360deg = grasp_metrics.grasp_acc_32cm_360deg( + self.y_valid, self.valid_preds) + self.valid_acc_32cm_360deg = tf.reduce_sum(self.valid_acc_32cm_360deg) + + if self.use_msle is False: + self.valid_loss = tf.losses.mean_squared_error( + labels=self.y_valid, predictions=self.valid_preds) + self.valid_loss_secondary = tf.reduce_mean(keras.losses.msle( + self.y_valid, self.valid_preds)) + else: + self.valid_loss = tf.reduce_mean(keras.losses.msle( + self.y_valid, self.valid_preds)) + self.valid_loss_secondary = tf.losses.mean_squared_error( + 
labels=self.y_valid, predictions=self.valid_preds) + + self.valid_cart_error = grasp_metrics.cart_error( + self.y_valid, self.valid_preds) + if self.rotation_only is True or self.stacking_reward is True: + self.valid_cart_error = tf.zeros([1]) + else: + self.valid_cart_error = tf.reduce_mean(self.valid_cart_error) + if self.translation_only is True or self.stacking_reward is True: + self.valid_angle_error = tf.zeros([1]) + else: + self.valid_angle_error = grasp_metrics.angle_error( + self.y_valid, self.valid_preds) + self.valid_angle_error = tf.reduce_mean(self.valid_angle_error) + self.valid_mae = tf.metrics.mean_absolute_error( + self.y_valid, self.valid_preds) + self.valid_mae = tf.reduce_mean(self.valid_mae) + + else: + cast_type = tf.to_int32 + self.valid_preds = tf.argmax(logits, axis=1) + self.valid_preds = cast_type(self.valid_preds) + self.valid_acc = tf.equal(self.valid_preds, self.y_valid) + self.valid_acc = cast_type(self.valid_acc) + self.valid_acc = tf.reduce_sum(self.valid_acc) + + # override + def _build_test(self): + print("-" * 80) + print("Build test graph") + logits = self._model(self.x_test, is_training=False, reuse=True) + if self.dataset == "stacking": + logits = tf.nn.sigmoid(logits) + cast_type = tf.to_float + self.test_preds = logits + self.test_acc = grasp_metrics.grasp_acc( + self.y_test, self.test_preds) + self.test_acc = tf.reduce_sum(self.test_acc) + + self.test_acc_5mm_7_5deg = grasp_metrics.grasp_acc_5mm_7_5deg( + self.y_test, self.test_preds) + self.test_acc_5mm_7_5deg = tf.reduce_sum(self.test_acc_5mm_7_5deg) + + self.test_acc_1cm_15deg = grasp_metrics.grasp_acc_1cm_15deg( + self.y_test, self.test_preds) + self.test_acc_1cm_15deg = tf.reduce_sum(self.test_acc_1cm_15deg) + + self.test_acc_2cm_30deg = grasp_metrics.grasp_acc_2cm_30deg( + self.y_test, self.test_preds) + self.test_acc_2cm_30deg = tf.reduce_sum(self.test_acc_2cm_30deg) + + self.test_acc_4cm_60deg = grasp_metrics.grasp_acc_4cm_60deg( + self.y_test, self.test_preds) + self.test_acc_4cm_60deg = tf.reduce_sum(self.test_acc_4cm_60deg) + + self.test_acc_8cm_120deg = grasp_metrics.grasp_acc_8cm_120deg( + self.y_test, self.test_preds) + self.test_acc_8cm_120deg = tf.reduce_sum(self.test_acc_8cm_120deg) + + self.test_acc_16cm_240deg = grasp_metrics.grasp_acc_16cm_240deg( + self.y_test, self.test_preds) + self.test_acc_16cm_240deg = tf.reduce_sum(self.test_acc_16cm_240deg) + + self.test_acc_32cm_360deg = grasp_metrics.grasp_acc_32cm_360deg( + self.y_test, self.test_preds) + self.test_acc_32cm_360deg = tf.reduce_sum(self.test_acc_32cm_360deg) + + self.test_cart_error = grasp_metrics.cart_error( + self.y_test, self.test_preds) + if self.rotation_only is True or self.stacking_reward is True: + self.test_cart_error = tf.zeros([1]) + else: + self.test_cart_error = tf.reduce_mean(self.test_cart_error) + if self.translation_only is True or self.stacking_reward is True: + self.test_angle_error = tf.zeros([1]) + else: + self.test_angle_error = grasp_metrics.angle_error( + self.y_test, self.test_preds) + self.test_angle_error = tf.reduce_mean(self.test_angle_error) + self.test_mae = tf.metrics.mean_absolute_error( + self.y_test, self.test_preds) + self.test_mae = tf.reduce_mean(self.test_mae) + if self.use_msle is False: + self.test_loss = tf.losses.mean_squared_error( + labels=self.y_test, predictions=self.test_preds) + self.test_loss_secondary = tf.reduce_mean(keras.losses.msle( + self.y_test, self.test_preds)) + else: + self.test_loss = tf.reduce_mean(keras.losses.msle( + self.y_test, self.test_preds)) + 
self.test_loss_secondary = tf.losses.mean_squared_error(
+                    labels=self.y_test, predictions=self.test_preds)
+
+        else:
+            cast_type = tf.to_int32
+            self.test_preds = tf.argmax(logits, axis=1)
+            self.test_preds = cast_type(self.test_preds)
+            self.test_acc = tf.equal(self.test_preds, self.y_test)
+            self.test_acc = cast_type(self.test_acc)
+            self.test_acc = tf.reduce_sum(self.test_acc)
+
+    # override
+    def build_valid_rl(self, shuffle=False):
+        print("-" * 80)
+        print("Build valid graph on shuffled data")
+        if self.dataset == "stacking":
+            with tf.device("/cpu:0"):
+                if not shuffle:
+                    self.x_valid_shuffle, self.y_valid_shuffle = self.x_valid, self.y_valid
+                else:
+                    raise NotImplementedError(
+                        'This portion of the code is not correctly implemented, '
+                        'so it must be fixed before running it. '
+                        'see models.py::__init__() for reference code using the '
+                        'CostarBlockStackingSequence().')
+                    # unreachable reference code for the branch above:
+                    data_features = ['image_0_image_n_vec_xyz_aaxyz_nsc_15']
+                    label_features = ['grasp_goal_xyz_aaxyz_nsc_8']
+                    validation_shuffle_generator = CostarBlockStackingSequence(
+                        self.validation_data, batch_size=self.batch_size, verbose=0,
+                        label_features_to_extract=label_features,
+                        data_features_to_extract=data_features, output_shape=self.image_shape, shuffle=True)
+                    # enqueue the sequence defined above, then draw batches from the enqueuer
+                    validation_enqueuer = OrderedEnqueuer(
+                        validation_shuffle_generator,
+                        use_multiprocessing=False,
+                        shuffle=True)
+                    validation_enqueuer.start(workers=10, max_queue_size=100)
+
+                    def validation_generator(): return iter(validation_enqueuer.get())
+                    validation_dataset = tf.data.Dataset.from_generator(validation_generator, (tf.float32, tf.float32), (tf.TensorShape([None, self.image_shape[0], self.image_shape[1], self.data_features_len]), tf.TensorShape([None, None])))
+                    self.x_valid_shuffle, self.y_valid_shuffle = validation_dataset.make_one_shot_iterator().get_next()
+
+        else:
+            with tf.device("/cpu:0"):
+                # shuffled valid data: for choosing validation model
+                if not shuffle and self.data_format == "NCHW":
+                    self.images["valid_original"] = np.transpose(
+                        self.images["valid_original"], [0, 3, 1, 2])
+                self.x_valid_shuffle, self.y_valid_shuffle = tf.train.shuffle_batch(
+                    [self.images["valid_original"], self.labels["valid_original"]],
+                    batch_size=self.batch_size,
+                    capacity=25000,
+                    enqueue_many=True,
+                    min_after_dequeue=0,
+                    num_threads=16,
+                    seed=self.seed,
+                    allow_smaller_final_batch=True,
+                )
+
+                def _pre_process(x):
+                    x = tf.pad(x, [[4, 4], [4, 4], [0, 0]])
+                    x = tf.random_crop(x, [32, 32, 3], seed=self.seed)
+                    x = tf.image.random_flip_left_right(x, seed=self.seed)
+                    if self.data_format == "NCHW":
+                        x = tf.transpose(x, [2, 0, 1])
+                    return x
+
+                if shuffle:
+                    self.x_valid_shuffle = tf.map_fn(
+                        _pre_process, self.x_valid_shuffle, back_prop=False)
+
+        # TODO(ahundt) should is_training really be true here? this looks like a validation step... but it is in the controller so maybe some training does happen...
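+        # Note: the removed implementation of this method also called self._model
+        # with is_training=True, so the call below preserves existing behavior.
+        # A plausible (but unconfirmed) rationale: the controller's reward is
+        # computed with the child's shared weights in training mode, so batch
+        # norm and drop-path behave exactly as they do during child training.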
+ logits = self._model( + self.x_valid_shuffle, is_training=True, reuse=True) + if self.dataset == "stacking": + logits = tf.nn.sigmoid(logits) + cast_type = tf.to_float + self.valid_shuffle_preds = logits + self.valid_shuffle_acc = grasp_metrics.grasp_acc( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc = tf.reduce_sum(self.valid_shuffle_acc) + + self.valid_shuffle_acc_5mm_7_5deg = grasp_metrics.grasp_acc_5mm_7_5deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_5mm_7_5deg = tf.reduce_sum(self.valid_shuffle_acc_5mm_7_5deg) + + self.valid_shuffle_acc_1cm_15deg = grasp_metrics.grasp_acc_1cm_15deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_1cm_15deg = tf.reduce_sum(self.valid_shuffle_acc_1cm_15deg) + + self.valid_shuffle_acc_2cm_30deg = grasp_metrics.grasp_acc_2cm_30deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_2cm_30deg = tf.reduce_sum(self.valid_shuffle_acc_2cm_30deg) + + self.valid_shuffle_acc_4cm_60deg = grasp_metrics.grasp_acc_4cm_60deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_4cm_60deg = tf.reduce_sum(self.valid_shuffle_acc_4cm_60deg) + + self.valid_shuffle_acc_8cm_120deg = grasp_metrics.grasp_acc_8cm_120deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_8cm_120deg = tf.reduce_sum(self.valid_shuffle_acc_8cm_120deg) + + self.valid_shuffle_acc_16cm_240deg = grasp_metrics.grasp_acc_16cm_240deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_16cm_240deg = tf.reduce_sum(self.valid_shuffle_acc_16cm_240deg) + + self.valid_shuffle_acc_32cm_360deg = grasp_metrics.grasp_acc_32cm_360deg( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_acc_32cm_360deg = tf.reduce_sum(self.valid_shuffle_acc_32cm_360deg) + + if self.use_msle is False: + self.valid_shuffle_loss = tf.reduce_mean(tf.losses.mean_squared_error( + labels=self.y_valid_shuffle, predictions=self.valid_shuffle_preds)) + self.valid_shuffle_loss_secondary = tf.reduce_mean(keras.losses.msle( + self.y_valid_shuffle, self.valid_shuffle_preds)) + else: + self.valid_shuffle_loss = tf.reduce_mean(keras.losses.msle( + self.y_valid_shuffle, self.valid_shuffle_preds)) + self.valid_shuffle_loss_secondary = tf.losses.mean_squared_error( + labels=self.y_valid_shuffle, predictions=self.valid_shuffle_preds) + + self.valid_shuffle_cart_error = grasp_metrics.cart_error( + self.y_valid_shuffle, self.valid_shuffle_preds) + if self.rotation_only is True or self.stacking_reward is True: + self.valid_shuffle_cart_error = tf.zeros([1]) + else: + self.valid_shuffle_cart_error = tf.reduce_mean(self.valid_shuffle_cart_error) + if self.translation_only is True or self.stacking_reward is True: + self.valid_shuffle_angle_error = tf.zeros([1]) + else: + self.valid_shuffle_angle_error = grasp_metrics.angle_error( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_angle_error = tf.reduce_mean(self.valid_shuffle_angle_error) + self.valid_shuffle_mae = tf.metrics.mean_absolute_error( + self.y_valid_shuffle, self.valid_shuffle_preds) + self.valid_shuffle_mae = tf.reduce_mean(self.valid_shuffle_mae) + + else: + cast_type = tf.to_int32 + self.valid_shuffle_preds = tf.argmax(logits, axis=1) + self.valid_shuffle_preds = cast_type(self.valid_shuffle_preds) + self.valid_shuffle_acc = tf.equal(self.valid_shuffle_preds, self.y_valid_shuffle) + self.valid_shuffle_acc = cast_type(self.valid_shuffle_acc) + self.valid_shuffle_acc = 
tf.reduce_sum(self.valid_shuffle_acc) + + def connect_controller(self, controller_model, verbose=0): + if self.fixed_arc is None: + sample_arc = controller_model.sample_arc + normal_arc, reduce_arc = sample_arc + # self.print_arc = tf.Print([0], [normal_arc, reduce_arc], 'connect_controller(): [normal_arc, reduce_arc]: ', summarize=20) + + if verbose: + normal_arc = tf.Print(normal_arc, [normal_arc, reduce_arc], 'connect_controller(): [normal_arc, reduce_arc]: ', summarize=20) + self.normal_arc = normal_arc + self.reduce_arc = reduce_arc + else: + fixed_arc = np.array([int(x) + for x in self.fixed_arc.split(" ") if x]) + self.normal_arc = fixed_arc[:4 * self.num_cells] + self.reduce_arc = fixed_arc[4 * self.num_cells:] + + self._build_train() + self._build_valid() + self._build_test() diff --git a/enas/cifar10/micro_controller.py b/enas/cifar10/micro_controller.py index 1ed39b0..c50181e 100644 --- a/enas/cifar10/micro_controller.py +++ b/enas/cifar10/micro_controller.py @@ -15,258 +15,301 @@ from tensorflow.python.training import moving_averages + class MicroController(Controller): - def __init__(self, - search_for="both", - search_whole_channels=False, - num_branches=6, - num_cells=6, - lstm_size=32, - lstm_num_layers=2, - lstm_keep_prob=1.0, - tanh_constant=None, - op_tanh_reduce=1.0, - temperature=None, - lr_init=1e-3, - lr_dec_start=0, - lr_dec_every=100, - lr_dec_rate=0.9, - l2_reg=0, - entropy_weight=None, - clip_mode=None, - grad_bound=None, - use_critic=False, - bl_dec=0.999, - optim_algo="adam", - sync_replicas=False, - num_aggregate=None, - num_replicas=None, - name="controller", - **kwargs): - - print("-" * 80) - print("Building ConvController") - - self.search_for = search_for - self.search_whole_channels = search_whole_channels - self.num_cells = num_cells - self.num_branches = num_branches - - self.lstm_size = lstm_size - self.lstm_num_layers = lstm_num_layers - self.lstm_keep_prob = lstm_keep_prob - self.tanh_constant = tanh_constant - self.op_tanh_reduce = op_tanh_reduce - self.temperature = temperature - self.lr_init = lr_init - self.lr_dec_start = lr_dec_start - self.lr_dec_every = lr_dec_every - self.lr_dec_rate = lr_dec_rate - self.l2_reg = l2_reg - self.entropy_weight = entropy_weight - self.clip_mode = clip_mode - self.grad_bound = grad_bound - self.use_critic = use_critic - self.bl_dec = bl_dec - - self.optim_algo = optim_algo - self.sync_replicas = sync_replicas - self.num_aggregate = num_aggregate - self.num_replicas = num_replicas - self.name = name - - self._create_params() - arc_seq_1, entropy_1, log_prob_1, c, h = self._build_sampler(use_bias=True) - arc_seq_2, entropy_2, log_prob_2, _, _ = self._build_sampler(prev_c=c, prev_h=h) - self.sample_arc = (arc_seq_1, arc_seq_2) - self.sample_entropy = entropy_1 + entropy_2 - self.sample_log_prob = log_prob_1 + log_prob_2 - - def _create_params(self): - initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1) - with tf.variable_scope(self.name, initializer=initializer): - with tf.variable_scope("lstm"): - self.w_lstm = [] - for layer_id in range(self.lstm_num_layers): - with tf.variable_scope("layer_{}".format(layer_id)): - w = tf.get_variable("w", [2 * self.lstm_size, 4 * self.lstm_size]) - self.w_lstm.append(w) - - self.g_emb = tf.get_variable("g_emb", [1, self.lstm_size]) - with tf.variable_scope("emb"): - self.w_emb = tf.get_variable("w", [self.num_branches, self.lstm_size]) - with tf.variable_scope("softmax"): - self.w_soft = tf.get_variable("w", [self.lstm_size, self.num_branches]) - b_init = 
np.array([10.0, 10.0] + [0] * (self.num_branches - 2), - dtype=np.float32) - self.b_soft = tf.get_variable( - "b", [1, self.num_branches], - initializer=tf.constant_initializer(b_init)) - - b_soft_no_learn = np.array( - [0.25, 0.25] + [-0.25] * (self.num_branches - 2), dtype=np.float32) - b_soft_no_learn = np.reshape(b_soft_no_learn, [1, self.num_branches]) - self.b_soft_no_learn = tf.constant(b_soft_no_learn, dtype=tf.float32) - - with tf.variable_scope("attention"): - self.w_attn_1 = tf.get_variable("w_1", [self.lstm_size, self.lstm_size]) - self.w_attn_2 = tf.get_variable("w_2", [self.lstm_size, self.lstm_size]) - self.v_attn = tf.get_variable("v", [self.lstm_size, 1]) - - def _build_sampler(self, prev_c=None, prev_h=None, use_bias=False): - """Build the sampler ops and the log_prob ops.""" - - print("-" * 80) - print("Build controller sampler") - - anchors = tf.TensorArray( - tf.float32, size=self.num_cells + 2, clear_after_read=False) - anchors_w_1 = tf.TensorArray( - tf.float32, size=self.num_cells + 2, clear_after_read=False) - arc_seq = tf.TensorArray(tf.int32, size=self.num_cells * 4) - if prev_c is None: - assert prev_h is None, "prev_c and prev_h must both be None" - prev_c = [tf.zeros([1, self.lstm_size], tf.float32) - for _ in range(self.lstm_num_layers)] - prev_h = [tf.zeros([1, self.lstm_size], tf.float32) - for _ in range(self.lstm_num_layers)] - inputs = self.g_emb - - for layer_id in range(2): - next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm) - prev_c, prev_h = next_c, next_h - anchors = anchors.write(layer_id, tf.zeros_like(next_h[-1])) - anchors_w_1 = anchors_w_1.write( - layer_id, tf.matmul(next_h[-1], self.w_attn_1)) - - def _condition(layer_id, *args): - return tf.less(layer_id, self.num_cells + 2) - - def _body(layer_id, inputs, prev_c, prev_h, anchors, anchors_w_1, arc_seq, - entropy, log_prob): - indices = tf.range(0, layer_id, dtype=tf.int32) - start_id = 4 * (layer_id - 2) - prev_layers = [] - for i in range(2): # index_1, index_2 - next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm) - prev_c, prev_h = next_c, next_h - query = anchors_w_1.gather(indices) - query = tf.reshape(query, [layer_id, self.lstm_size]) - query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2)) - query = tf.matmul(query, self.v_attn) - logits = tf.reshape(query, [1, layer_id]) - if self.temperature is not None: - logits /= self.temperature - if self.tanh_constant is not None: - logits = self.tanh_constant * tf.tanh(logits) - index = tf.multinomial(logits, 1) - index = tf.to_int32(index) - index = tf.reshape(index, [1]) - arc_seq = arc_seq.write(start_id + 2 * i, index) - curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=index) - log_prob += curr_log_prob - curr_ent = tf.stop_gradient(tf.nn.softmax_cross_entropy_with_logits( - logits=logits, labels=tf.nn.softmax(logits))) - entropy += curr_ent - prev_layers.append(anchors.read(tf.reduce_sum(index))) - inputs = prev_layers[-1] - - for i in range(2): # op_1, op_2 - next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm) - prev_c, prev_h = next_c, next_h - logits = tf.matmul(next_h[-1], self.w_soft) + self.b_soft - if self.temperature is not None: - logits /= self.temperature - if self.tanh_constant is not None: - op_tanh = self.tanh_constant / self.op_tanh_reduce - logits = op_tanh * tf.tanh(logits) - if use_bias: - logits += self.b_soft_no_learn - op_id = tf.multinomial(logits, 1) - op_id = tf.to_int32(op_id) - op_id = tf.reshape(op_id, [1]) - arc_seq = 
arc_seq.write(start_id + 2 * i + 1, op_id) - curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=op_id) - log_prob += curr_log_prob - curr_ent = tf.stop_gradient(tf.nn.softmax_cross_entropy_with_logits( - logits=logits, labels=tf.nn.softmax(logits))) - entropy += curr_ent - inputs = tf.nn.embedding_lookup(self.w_emb, op_id) - - next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm) - anchors = anchors.write(layer_id, next_h[-1]) - anchors_w_1 = anchors_w_1.write(layer_id, tf.matmul(next_h[-1], self.w_attn_1)) - inputs = self.g_emb - - return (layer_id + 1, inputs, next_c, next_h, anchors, anchors_w_1, - arc_seq, entropy, log_prob) - - loop_vars = [ - tf.constant(2, dtype=tf.int32, name="layer_id"), - inputs, - prev_c, - prev_h, - anchors, - anchors_w_1, - arc_seq, - tf.constant([0.0], dtype=tf.float32, name="entropy"), - tf.constant([0.0], dtype=tf.float32, name="log_prob"), - ] - - loop_outputs = tf.while_loop(_condition, _body, loop_vars, - parallel_iterations=1) - - arc_seq = loop_outputs[-3].stack() - arc_seq = tf.reshape(arc_seq, [-1]) - entropy = tf.reduce_sum(loop_outputs[-2]) - log_prob = tf.reduce_sum(loop_outputs[-1]) - - last_c = loop_outputs[-7] - last_h = loop_outputs[-6] - - return arc_seq, entropy, log_prob, last_c, last_h - - def build_trainer(self, child_model): - child_model.build_valid_rl() - self.valid_acc = (tf.to_float(child_model.valid_shuffle_acc) / - tf.to_float(child_model.batch_size)) - self.reward = self.valid_acc - - if self.entropy_weight is not None: - self.reward += self.entropy_weight * self.sample_entropy - - self.sample_log_prob = tf.reduce_sum(self.sample_log_prob) - self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) - baseline_update = tf.assign_sub( - self.baseline, (1 - self.bl_dec) * (self.baseline - self.reward)) - - with tf.control_dependencies([baseline_update]): - self.reward = tf.identity(self.reward) - - self.loss = self.sample_log_prob * (self.reward - self.baseline) - self.train_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="train_step") - - tf_variables = [var for var in tf.trainable_variables() if var.name.startswith(self.name)] - print("-" * 80) - for var in tf_variables: - print(var) - - self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( - self.loss, - tf_variables, - self.train_step, - clip_mode=self.clip_mode, - grad_bound=self.grad_bound, - l2_reg=self.l2_reg, - lr_init=self.lr_init, - lr_dec_start=self.lr_dec_start, - lr_dec_every=self.lr_dec_every, - lr_dec_rate=self.lr_dec_rate, - optim_algo=self.optim_algo, - sync_replicas=self.sync_replicas, - num_aggregate=self.num_aggregate, - num_replicas=self.num_replicas) - - self.skip_rate = tf.constant(0.0, dtype=tf.float32) + def __init__(self, + search_for="both", + search_whole_channels=False, + num_branches=6, + num_cells=6, + lstm_size=32, + lstm_num_layers=2, + lstm_keep_prob=1.0, + tanh_constant=None, + op_tanh_reduce=1.0, + temperature=None, + lr_init=1e-3, + lr_dec_start=0, + lr_dec_every=100, + lr_dec_rate=0.9, + l2_reg=0, + entropy_weight=None, + clip_mode=None, + grad_bound=None, + use_critic=False, + bl_dec=0.999, + optim_algo="adam", + sync_replicas=False, + num_aggregate=None, + num_replicas=None, + name="controller", + max_loss=0, + translation_only=False, + rotation_only=False, + dataset="cifar", + **kwargs): + + print("-" * 80) + print("Building ConvController") + + self.search_for = search_for + self.search_whole_channels = search_whole_channels + self.num_cells = num_cells + 
self.num_branches = num_branches + + self.lstm_size = lstm_size + self.lstm_num_layers = lstm_num_layers + self.lstm_keep_prob = lstm_keep_prob + self.tanh_constant = tanh_constant + self.op_tanh_reduce = op_tanh_reduce + self.temperature = temperature + self.lr_init = lr_init + self.lr_dec_start = lr_dec_start + self.lr_dec_every = lr_dec_every + self.lr_dec_rate = lr_dec_rate + self.l2_reg = l2_reg + self.entropy_weight = entropy_weight + self.clip_mode = clip_mode + self.grad_bound = grad_bound + self.use_critic = use_critic + self.bl_dec = bl_dec + + self.optim_algo = optim_algo + self.sync_replicas = sync_replicas + self.num_aggregate = num_aggregate + self.num_replicas = num_replicas + self.name = name + self.dataset = dataset + self.max_loss = max_loss + self.rotation_only = rotation_only + self.translation_only = translation_only + + self._create_params() + arc_seq_1, entropy_1, log_prob_1, c, h = self._build_sampler( + use_bias=True) + arc_seq_2, entropy_2, log_prob_2, _, _ = self._build_sampler( + prev_c=c, prev_h=h) + self.sample_arc = (arc_seq_1, arc_seq_2) + self.sample_entropy = entropy_1 + entropy_2 + self.sample_log_prob = log_prob_1 + log_prob_2 + + def _create_params(self): + initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1) + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope("lstm"): + self.w_lstm = [] + for layer_id in range(self.lstm_num_layers): + with tf.variable_scope("layer_{}".format(layer_id)): + w = tf.get_variable( + "w", [2 * self.lstm_size, 4 * self.lstm_size]) + self.w_lstm.append(w) + + self.g_emb = tf.get_variable("g_emb", [1, self.lstm_size]) + with tf.variable_scope("emb"): + self.w_emb = tf.get_variable( + "w", [self.num_branches, self.lstm_size]) + with tf.variable_scope("softmax"): + self.w_soft = tf.get_variable( + "w", [self.lstm_size, self.num_branches]) + b_init = np.array([10.0, 10.0] + [0] * (self.num_branches - 2), + dtype=np.float32) + self.b_soft = tf.get_variable( + "b", [1, self.num_branches], + initializer=tf.constant_initializer(b_init)) + + b_soft_no_learn = np.array( + [0.25, 0.25] + [-0.25] * (self.num_branches - 2), dtype=np.float32) + b_soft_no_learn = np.reshape( + b_soft_no_learn, [1, self.num_branches]) + self.b_soft_no_learn = tf.constant( + b_soft_no_learn, dtype=tf.float32) + + with tf.variable_scope("attention"): + self.w_attn_1 = tf.get_variable( + "w_1", [self.lstm_size, self.lstm_size]) + self.w_attn_2 = tf.get_variable( + "w_2", [self.lstm_size, self.lstm_size]) + self.v_attn = tf.get_variable("v", [self.lstm_size, 1]) + + def _build_sampler(self, prev_c=None, prev_h=None, use_bias=False): + """Build the sampler ops and the log_prob ops.""" + + print("-" * 80) + print("Build controller sampler") + + anchors = tf.TensorArray( + tf.float32, size=self.num_cells + 2, clear_after_read=False) + anchors_w_1 = tf.TensorArray( + tf.float32, size=self.num_cells + 2, clear_after_read=False) + arc_seq = tf.TensorArray(tf.int32, size=self.num_cells * 4) + if prev_c is None: + assert prev_h is None, "prev_c and prev_h must both be None" + prev_c = [tf.zeros([1, self.lstm_size], tf.float32) + for _ in range(self.lstm_num_layers)] + prev_h = [tf.zeros([1, self.lstm_size], tf.float32) + for _ in range(self.lstm_num_layers)] + inputs = self.g_emb + + for layer_id in range(2): + next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + anchors = anchors.write(layer_id, tf.zeros_like(next_h[-1])) + anchors_w_1 = anchors_w_1.write( + layer_id, 
tf.matmul(next_h[-1], self.w_attn_1)) + + def _condition(layer_id, *args): + return tf.less(layer_id, self.num_cells + 2) + + def _body(layer_id, inputs, prev_c, prev_h, anchors, anchors_w_1, arc_seq, + entropy, log_prob): + indices = tf.range(0, layer_id, dtype=tf.int32) + start_id = 4 * (layer_id - 2) + prev_layers = [] + for i in range(2): # index_1, index_2 + next_c, next_h = stack_lstm( + inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + query = anchors_w_1.gather(indices) + query = tf.reshape(query, [layer_id, self.lstm_size]) + query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2)) + query = tf.matmul(query, self.v_attn) + logits = tf.reshape(query, [1, layer_id]) + if self.temperature is not None: + logits /= self.temperature + if self.tanh_constant is not None: + logits = self.tanh_constant * tf.tanh(logits) + index = tf.multinomial(logits, 1) + index = tf.to_int32(index) + index = tf.reshape(index, [1]) + arc_seq = arc_seq.write(start_id + 2 * i, index) + curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=index) + log_prob += curr_log_prob + curr_ent = tf.stop_gradient(tf.nn.softmax_cross_entropy_with_logits( + logits=logits, labels=tf.nn.softmax(logits))) + entropy += curr_ent + prev_layers.append(anchors.read(tf.reduce_sum(index))) + inputs = prev_layers[-1] + + for i in range(2): # op_1, op_2 + next_c, next_h = stack_lstm( + inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + logits = tf.matmul(next_h[-1], self.w_soft) + self.b_soft + if self.temperature is not None: + logits /= self.temperature + if self.tanh_constant is not None: + op_tanh = self.tanh_constant / self.op_tanh_reduce + logits = op_tanh * tf.tanh(logits) + if use_bias: + logits += self.b_soft_no_learn + op_id = tf.multinomial(logits, 1) + op_id = tf.to_int32(op_id) + op_id = tf.reshape(op_id, [1]) + arc_seq = arc_seq.write(start_id + 2 * i + 1, op_id) + curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=op_id) + log_prob += curr_log_prob + curr_ent = tf.stop_gradient(tf.nn.softmax_cross_entropy_with_logits( + logits=logits, labels=tf.nn.softmax(logits))) + entropy += curr_ent + inputs = tf.nn.embedding_lookup(self.w_emb, op_id) + + next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm) + anchors = anchors.write(layer_id, next_h[-1]) + anchors_w_1 = anchors_w_1.write( + layer_id, tf.matmul(next_h[-1], self.w_attn_1)) + inputs = self.g_emb + + return (layer_id + 1, inputs, next_c, next_h, anchors, anchors_w_1, + arc_seq, entropy, log_prob) + + loop_vars = [ + tf.constant(2, dtype=tf.int32, name="layer_id"), + inputs, + prev_c, + prev_h, + anchors, + anchors_w_1, + arc_seq, + tf.constant([0.0], dtype=tf.float32, name="entropy"), + tf.constant([0.0], dtype=tf.float32, name="log_prob"), + ] + + loop_outputs = tf.while_loop(_condition, _body, loop_vars, + parallel_iterations=1) + + arc_seq = loop_outputs[-3].stack() + arc_seq = tf.reshape(arc_seq, [-1]) + entropy = tf.reduce_sum(loop_outputs[-2]) + log_prob = tf.reduce_sum(loop_outputs[-1]) + + last_c = loop_outputs[-7] + last_h = loop_outputs[-6] + + return arc_seq, entropy, log_prob, last_c, last_h + + def build_trainer(self, child_model): + child_model.build_valid_rl() + self.valid_acc = (tf.to_float(child_model.valid_shuffle_acc) / + tf.to_float(child_model.batch_size)) + if self.dataset == "stacking": + # rewards like mse should grow fast as the distance from 0 shrinks, + # since the possible improvement gets smaller as you get 
closer to the exact goal pose + # use epsilon to avoid dividing by 0 + epsilon = 1e-12 + self.reward = 1 / tf.maximum(tf.abs(child_model.valid_shuffle_loss), epsilon) + # previous reward which sort of worked: + # self.reward = self.max_loss-child_model.valid_shuffle_loss + self.mse = child_model.valid_shuffle_loss + self.mae = child_model.valid_shuffle_mae + self.angle_error = child_model.valid_shuffle_angle_error + self.cart_error = child_model.valid_shuffle_cart_error + + else: + self.reward = self.valid_acc + self.mse = tf.zeros([1]) + self.mae = tf.zeros([1]) + self.angle_error = tf.zeros([1]) + self.cart_error = tf.zeros([1]) + + if self.entropy_weight is not None: + self.reward += self.entropy_weight * self.sample_entropy + + self.sample_log_prob_ = self.sample_log_prob + + self.sample_log_prob = tf.reduce_sum(self.sample_log_prob) + self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) + baseline_update = tf.assign_sub( + self.baseline, (1 - self.bl_dec) * (self.baseline - self.reward)) + + with tf.control_dependencies([baseline_update]): + self.reward = tf.identity(self.reward) + + self.loss = self.sample_log_prob * (self.reward - self.baseline) + self.train_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name="train_step") + + tf_variables = [var for var in tf.trainable_variables() + if var.name.startswith(self.name)] + print("-" * 80) + for var in tf_variables: + print(var) + + self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( + self.loss, + tf_variables, + self.train_step, + clip_mode=self.clip_mode, + grad_bound=self.grad_bound, + l2_reg=self.l2_reg, + lr_init=self.lr_init, + lr_dec_start=self.lr_dec_start, + lr_dec_every=self.lr_dec_every, + lr_dec_rate=self.lr_dec_rate, + optim_algo=self.optim_algo, + sync_replicas=self.sync_replicas, + num_aggregate=self.num_aggregate, + num_replicas=self.num_replicas) + + self.skip_rate = tf.constant(0.0, dtype=tf.float32) diff --git a/enas/cifar10/models.py b/enas/cifar10/models.py index 9e31587..5479130 100644 --- a/enas/cifar10/models.py +++ b/enas/cifar10/models.py @@ -9,283 +9,480 @@ from enas.cifar10.image_ops import batch_norm from enas.cifar10.image_ops import relu from enas.cifar10.image_ops import max_pool -from enas.cifar10.image_ops import global_avg_pool +from enas.cifar10.image_ops import global_max_pool from enas.utils import count_model_params from enas.utils import get_train_ops +from block_stacking_reader import CostarBlockStackingSequence +from keras.utils import OrderedEnqueuer +import glob + class Model(object): - def __init__(self, - images, - labels, - cutout_size=None, - batch_size=32, - eval_batch_size=100, - clip_mode=None, - grad_bound=None, - l2_reg=1e-4, - lr_init=0.1, - lr_dec_start=0, - lr_dec_every=100, - lr_dec_rate=0.1, - keep_prob=1.0, - optim_algo=None, - sync_replicas=False, - num_aggregate=None, - num_replicas=None, - data_format="NHWC", - name="generic_model", - seed=None, - ): - """ - Args: - lr_dec_every: number of epochs to decay - """ - print("-" * 80) - print("Build model {}".format(name)) - - self.cutout_size = cutout_size - self.batch_size = batch_size - self.eval_batch_size = eval_batch_size - self.clip_mode = clip_mode - self.grad_bound = grad_bound - self.l2_reg = l2_reg - self.lr_init = lr_init - self.lr_dec_start = lr_dec_start - self.lr_dec_rate = lr_dec_rate - self.keep_prob = keep_prob - self.optim_algo = optim_algo - self.sync_replicas = sync_replicas - self.num_aggregate = num_aggregate - self.num_replicas = num_replicas - self.data_format 
= data_format - self.name = name - self.seed = seed - - self.global_step = None - self.valid_acc = None - self.test_acc = None - print("Build data ops") - with tf.device("/cpu:0"): - # training data - self.num_train_examples = np.shape(images["train"])[0] - self.num_train_batches = ( - self.num_train_examples + self.batch_size - 1) // self.batch_size - x_train, y_train = tf.train.shuffle_batch( - [images["train"], labels["train"]], - batch_size=self.batch_size, - capacity=50000, - enqueue_many=True, - min_after_dequeue=0, - num_threads=16, - seed=self.seed, - allow_smaller_final_batch=True, - ) - self.lr_dec_every = lr_dec_every * self.num_train_batches - - def _pre_process(x): - print("prep shape ",x.get_shape()) - dims = list(x.get_shape()) - dim = max(dims) - x = tf.pad(x, [[4, 4], [4, 4], [0, 0]]) - #x = tf.random_crop(x, [32, 32, 3], seed=self.seed) - x = tf.random_crop(x, dims, seed=self.seed) - x = tf.image.random_flip_left_right(x, seed=self.seed) - if self.cutout_size is not None: - mask = tf.ones([self.cutout_size, self.cutout_size], dtype=tf.int32) - start = tf.random_uniform([2], minval=0, maxval=dim, dtype=tf.int32) - mask = tf.pad(mask, [[self.cutout_size + start[0], dim - start[0]], - [self.cutout_size + start[1], dim - start[1]]]) - mask = mask[self.cutout_size: self.cutout_size + dim, - self.cutout_size: self.cutout_size + dim] - mask = tf.reshape(mask, [dim, dim, 1]) - mask = tf.tile(mask, [1, 1, dims[2]]) - x = tf.where(tf.equal(mask, 0), x=x, y=tf.zeros_like(x)) - if self.data_format == "NCHW": - x = tf.transpose(x, [2, 0, 1]) - - return x - self.x_train = tf.map_fn(_pre_process, x_train, back_prop=False) - self.y_train = y_train - - # valid data - self.x_valid, self.y_valid = None, None - if images["valid"] is not None: - images["valid_original"] = np.copy(images["valid"]) - labels["valid_original"] = np.copy(labels["valid"]) - if self.data_format == "NCHW": - images["valid"] = tf.transpose(images["valid"], [0, 3, 1, 2]) - self.num_valid_examples = np.shape(images["valid"])[0] - self.num_valid_batches = ( - (self.num_valid_examples + self.eval_batch_size - 1) - // self.eval_batch_size) - self.x_valid, self.y_valid = tf.train.batch( - [images["valid"], labels["valid"]], - batch_size=self.eval_batch_size, - capacity=5000, - enqueue_many=True, - num_threads=1, - allow_smaller_final_batch=True, - ) - - # test data - if self.data_format == "NCHW": - images["test"] = tf.transpose(images["test"], [0, 3, 1, 2]) - self.num_test_examples = np.shape(images["test"])[0] - self.num_test_batches = ( - (self.num_test_examples + self.eval_batch_size - 1) - // self.eval_batch_size) - self.x_test, self.y_test = tf.train.batch( - [images["test"], labels["test"]], - batch_size=self.eval_batch_size, - capacity=10000, - enqueue_many=True, - num_threads=1, - allow_smaller_final_batch=True, - ) - - # cache images and labels - self.images = images - self.labels = labels - - def eval_once(self, sess, eval_set, feed_dict=None, verbose=False): - """Expects self.acc and self.global_step to be defined. - - Args: - sess: tf.Session() or one of its wrap arounds. - feed_dict: can be used to give more information to sess.run(). 
- eval_set: "valid" or "test" - """ - - assert self.global_step is not None - global_step = sess.run(self.global_step) - print("Eval at {}".format(global_step)) - - if eval_set == "valid": - assert self.x_valid is not None - assert self.valid_acc is not None - num_examples = self.num_valid_examples - num_batches = self.num_valid_batches - acc_op = self.valid_acc - elif eval_set == "test": - assert self.test_acc is not None - num_examples = self.num_test_examples - num_batches = self.num_test_batches - acc_op = self.test_acc - else: - raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) - - total_acc = 0 - total_exp = 0 - for batch_id in range(num_batches): - acc = sess.run(acc_op, feed_dict=feed_dict) - total_acc += acc - total_exp += self.eval_batch_size - if verbose: - sys.stdout.write("\r{:<5d}/{:>5d}".format(total_acc, total_exp)) - if verbose: - print("") - print("{}_accuracy: {:<6.4f}".format( - eval_set, float(total_acc) / total_exp)) - - def _build_train(self): - print("Build train graph") - logits = self._model(self.x_train, True) - log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=self.y_train) - self.loss = tf.reduce_mean(log_probs) - - self.train_preds = tf.argmax(logits, axis=1) - self.train_preds = tf.to_int32(self.train_preds) - self.train_acc = tf.equal(self.train_preds, self.y_train) - self.train_acc = tf.to_int32(self.train_acc) - self.train_acc = tf.reduce_sum(self.train_acc) - - tf_variables = [var - for var in tf.trainable_variables() if var.name.startswith(self.name)] - self.num_vars = count_model_params(tf_variables) - print("-" * 80) - for var in tf_variables: - print(var) - - self.global_step = tf.Variable( - 0, dtype=tf.int32, trainable=False, name="global_step") - self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( - self.loss, - tf_variables, - self.global_step, - clip_mode=self.clip_mode, - grad_bound=self.grad_bound, - l2_reg=self.l2_reg, - lr_init=self.lr_init, - lr_dec_start=self.lr_dec_start, - lr_dec_every=self.lr_dec_every, - lr_dec_rate=self.lr_dec_rate, - optim_algo=self.optim_algo, - sync_replicas=self.sync_replicas, - num_aggregate=self.num_aggregate, - num_replicas=self.num_replicas) - - def _build_valid(self): - if self.x_valid is not None: - print("-" * 80) - print("Build valid graph") - logits = self._model(self.x_valid, False, reuse=True) - self.valid_preds = tf.argmax(logits, axis=1) - self.valid_preds = tf.to_int32(self.valid_preds) - self.valid_acc = tf.equal(self.valid_preds, self.y_valid) - self.valid_acc = tf.to_int32(self.valid_acc) - self.valid_acc = tf.reduce_sum(self.valid_acc) - - def _build_test(self): - print("-" * 80) - print("Build test graph") - logits = self._model(self.x_test, False, reuse=True) - self.test_preds = tf.argmax(logits, axis=1) - self.test_preds = tf.to_int32(self.test_preds) - self.test_acc = tf.equal(self.test_preds, self.y_test) - self.test_acc = tf.to_int32(self.test_acc) - self.test_acc = tf.reduce_sum(self.test_acc) - - def build_valid_rl(self, shuffle=False): - print("-" * 80) - print("Build valid graph on shuffled data") - with tf.device("/cpu:0"): - # shuffled valid data: for choosing validation model - if not shuffle and self.data_format == "NCHW": - self.images["valid_original"] = np.transpose( - self.images["valid_original"], [0, 3, 1, 2]) - x_valid_shuffle, y_valid_shuffle = tf.train.shuffle_batch( - [self.images["valid_original"], self.labels["valid_original"]], - batch_size=self.batch_size, - capacity=25000, - enqueue_many=True, - 
min_after_dequeue=0, - num_threads=16, - seed=self.seed, - allow_smaller_final_batch=True, - ) - - def _pre_process(x): - x = tf.pad(x, [[4, 4], [4, 4], [0, 0]]) - x = tf.random_crop(x, list(x.get_shape()), seed=self.seed) - x = tf.image.random_flip_left_right(x, seed=self.seed) - if self.data_format == "NCHW": - x = tf.transpose(x, [2, 0, 1]) - - return x - - if shuffle: - x_valid_shuffle = tf.map_fn(_pre_process, x_valid_shuffle, - back_prop=False) - - logits = self._model(x_valid_shuffle, False, reuse=True) - valid_shuffle_preds = tf.argmax(logits, axis=1) - valid_shuffle_preds = tf.to_int32(valid_shuffle_preds) - self.valid_shuffle_acc = tf.equal(valid_shuffle_preds, y_valid_shuffle) - self.valid_shuffle_acc = tf.to_int32(self.valid_shuffle_acc) - self.valid_shuffle_acc = tf.reduce_sum(self.valid_shuffle_acc) - - def _model(self, images, is_training, reuse=None): - raise NotImplementedError("Abstract method") + def __init__(self, + images, + labels, + cutout_size=None, + batch_size=32, + eval_batch_size=32, + clip_mode=None, + grad_bound=None, + l2_reg=1e-4, + lr_init=0.1, + lr_dec_start=0, + lr_dec_every=100, + lr_dec_rate=0.1, + keep_prob=1.0, + optim_algo=None, + sync_replicas=False, + num_aggregate=None, + num_replicas=None, + data_format="NHWC", + name="generic_model", + seed=None, + valid_set_size=32, + image_shape=(32, 32, 3), + translation_only=False, + rotation_only=False, + stacking_reward=False, + use_root=False, + dataset="cifar", + data_base_path="", + one_hot_encoding=False, + random_augmentation=None + ): + """ + Args: + lr_dec_every: number of epochs to decay + """ + print("-" * 80) + print("Build model {}".format(name)) + + self.cutout_size = cutout_size + self.batch_size = batch_size + # TODO change back to eval_batch size, pass eval_batch_size from arguments + self.eval_batch_size = batch_size + self.clip_mode = clip_mode + self.grad_bound = grad_bound + self.l2_reg = l2_reg + self.lr_init = lr_init + self.lr_dec_start = lr_dec_start + self.lr_dec_rate = lr_dec_rate + self.keep_prob = keep_prob + self.optim_algo = optim_algo + self.sync_replicas = sync_replicas + self.num_aggregate = num_aggregate + self.num_replicas = num_replicas + self.data_format = data_format + self.name = name + self.seed = seed + self.dataset = dataset + self.valid_set_size = valid_set_size + self.image_shape = image_shape + self.rotation_only = rotation_only + self.translation_only = translation_only + self.stacking_reward = stacking_reward + self.random_augmentation = random_augmentation + self.data_base_path = data_base_path + self.use_root = use_root + self.one_hot_encoding = one_hot_encoding + + self.global_step = None + self.valid_acc = None + self.test_acc = None + print("Build data ops") + with tf.device("/cpu:0"): + # training data + + # Support for stacking generator + print("dataset----------------------", self.dataset) + if self.dataset == "stacking": + Dataset = tf.data.Dataset + flags = tf.app.flags + FLAGS = flags.FLAGS + np.random.seed(0) + val_test_size = self.valid_set_size + if images["path"] != "": + print("datadir------------", images["path"]) + file_names = glob.glob(os.path.expanduser(images["path"])) + train_data = file_names[val_test_size*2:] + validation_data = file_names[val_test_size:val_test_size*2] + self.validation_data = validation_data + test_data = file_names[:val_test_size] + else: + print("-------Loading train-test-val from txt files-------") + self.data_base_path = os.path.expanduser(self.data_base_path) + with open(self.data_base_path + 
'costar_block_stacking_v0.3_success_only_train_files.txt', mode='r') as myfile: + train_data = myfile.read().splitlines() + with open(self.data_base_path + 'costar_block_stacking_v0.3_success_only_test_files.txt', mode='r') as myfile: + test_data = myfile.read().splitlines() + with open(self.data_base_path + 'costar_block_stacking_v0.3_success_only_val_files.txt', mode='r') as myfile: + validation_data = myfile.read().splitlines() + print(train_data) + # train_data = [self.data_base_path + name for name in train_data] + # test_data = [self.data_base_path + name for name in test_data] + # validation_data = [self.data_base_path + name for name in validation_data] + print(validation_data) + # number of images to look at per example + # TODO(ahundt) there is currently a bug in one of these calculations, so lower the images per example to reduce the number of steps per epoch for now. + estimated_images_per_example = 2 + print("valid set size", val_test_size) + # TODO(ahundt) fix the quick hack below, which proceeds through epochs faster + # self.num_train_examples = len(train_data) * self.batch_size * estimated_images_per_example + # self.num_train_batches = (self.num_train_examples + self.batch_size - 1) // self.batch_size + self.num_train_examples = len(train_data) * estimated_images_per_example + self.num_train_batches = (self.num_train_examples + self.batch_size - 1) // self.batch_size + # output_shape = (32, 32, 3) + # WARNING: IF YOU ARE EDITING THIS CODE, MAKE SURE TO ALSO CHECK micro_controller.py AND micro_child.py, WHICH ALSO HAVE A GENERATOR + if self.translation_only is True: + # We've found evidence in hyperopt (though it is not conclusive) that feeding in the + # rotation component actually lowers translation accuracy, + # at least in the colored block case. + # Switch between the two commented lines to go back to the previous behavior. + # data_features = ['image_0_image_n_vec_xyz_aaxyz_nsc_15'] + # self.data_features_len = 15 + data_features = ['image_0_image_n_vec_xyz_nxygrid_12'] + self.data_features_len = 12 + label_features = ['grasp_goal_xyz_3'] + self.num_classes = 3 + elif self.rotation_only is True: + data_features = ['image_0_image_n_vec_xyz_aaxyz_nsc_15'] + self.data_features_len = 15 + # The 2 lines below are disabled because the best run, 2018_12_2054, used the settings above. + # They include a normalized xy grid, similar to Uber's coordconv. + # data_features = ['image_0_image_n_vec_xyz_aaxyz_nsc_nxygrid_17'] + # self.data_features_len = 17 + label_features = ['grasp_goal_aaxyz_nsc_5'] + self.num_classes = 5 + elif self.stacking_reward is True: + data_features = ['image_0_image_n_vec_0_vec_n_xyz_aaxyz_nsc_nxygrid_25'] + self.data_features_len = 25 + label_features = ['stacking_reward'] + self.num_classes = 1 + # elif self.use_root is True: + # data_features = ['current_xyz_aaxyz_nsc_8'] + # self.data_features_len = 8 + # label_features = ['grasp_goal_xyz_3'] + # self.num_classes = 8 + else: + # original input block + # data_features = ['image_0_image_n_vec_xyz_aaxyz_nsc_15'] + # include a normalized xy grid, similar to Uber's coordconv + data_features = ['image_0_image_n_vec_xyz_aaxyz_nsc_nxygrid_17'] + self.data_features_len = 17 + label_features = ['grasp_goal_xyz_aaxyz_nsc_8'] + self.num_classes = 8 + if self.one_hot_encoding: + self.data_features_len += 40 + training_generator = CostarBlockStackingSequence( + train_data, batch_size=batch_size, verbose=0, + label_features_to_extract=label_features, + data_features_to_extract=data_features, output_shape=self.image_shape, shuffle=True, +
random_augmentation=self.random_augmentation, one_hot_encoding=self.one_hot_encoding) + + train_enqueuer = OrderedEnqueuer( + training_generator, + use_multiprocessing=False, + shuffle=True) + train_enqueuer.start(workers=10, max_queue_size=100) + + def train_generator(): return iter(train_enqueuer.get()) + + train_dataset = Dataset.from_generator(train_generator, (tf.float32, tf.float32), (tf.TensorShape( + [None, self.image_shape[0], self.image_shape[1], self.data_features_len]), tf.TensorShape([None, None]))) + # if self.use_root is True: + # train_dataset = Dataset.from_generator(train_generator, (tf.float32, tf.float32), (tf.TensorShape( + # [None, 2]), tf.TensorShape([None, None]))) + trainer = train_dataset.make_one_shot_iterator() + x_train, y_train = trainer.get_next() + # x_train_list = [] + # x_train_list[0] = np.reshape(x_train[0][0], [-1, self.image_shape[1], self.image_shape[2], 3]) + # x_train_list[1] = np.reshape(x_train[0][1], [-1, self.image_shape[1], self.image_shape[2], 3]) + # x_train_list[2] = np.reshape(x_train[0][2],[-1, ]) + # print("x shape--------------", x_train.shape) + print("batch--------------------------", + self.num_train_examples, self.num_train_batches) + print("y shape--------------", y_train.shape) + self.x_train = x_train + self.y_train = y_train + + else: + self.num_train_examples = np.shape(images["train"])[0] + self.num_classes = 10 + self.num_train_batches = ( + self.num_train_examples + self.batch_size - 1) // self.batch_size + + x_train, y_train = tf.train.shuffle_batch( + [images["train"], labels["train"]], + batch_size=self.batch_size, + capacity=50000, + enqueue_many=True, + min_after_dequeue=0, + num_threads=16, + seed=self.seed, + allow_smaller_final_batch=True, + ) + + def _pre_process(x): + print("prep shape ", x.get_shape()) + dims = list(x.get_shape()) + dim = max(dims) + x = tf.pad(x, [[4, 4], [4, 4], [0, 0]]) + #x = tf.random_crop(x, [32, 32, 3], seed=self.seed) + x = tf.random_crop(x, dims, seed=self.seed) + x = tf.image.random_flip_left_right(x, seed=self.seed) + if self.cutout_size is not None: + mask = tf.ones( + [self.cutout_size, self.cutout_size], dtype=tf.int32) + start = tf.random_uniform( + [2], minval=0, maxval=dim, dtype=tf.int32) + mask = tf.pad(mask, [[self.cutout_size + start[0], dim - start[0]], + [self.cutout_size + start[1], dim - start[1]]]) + mask = mask[self.cutout_size: self.cutout_size + dim, + self.cutout_size: self.cutout_size + dim] + mask = tf.reshape(mask, [dim, dim, 1]) + mask = tf.tile(mask, [1, 1, dims[2]]) + x = tf.where(tf.equal(mask, 0), x=x, + y=tf.zeros_like(x)) + if self.data_format == "NCHW": + x = tf.transpose(x, [2, 0, 1]) + + return x + self.x_train = tf.map_fn( + _pre_process, x_train, back_prop=False) + self.y_train = y_train + self.lr_dec_every = lr_dec_every * self.num_train_batches + + # valid data + self.x_valid, self.y_valid = None, None + if self.dataset == "stacking": + # TODO + validation_generator = CostarBlockStackingSequence( + validation_data, batch_size=batch_size, verbose=0, + label_features_to_extract=label_features, + data_features_to_extract=data_features, output_shape=self.image_shape, one_hot_encoding=self.one_hot_encoding) + validation_enqueuer = OrderedEnqueuer( + validation_generator, + use_multiprocessing=False, + shuffle=True) + validation_enqueuer.start(workers=10, max_queue_size=100) + + def valid_generator(): return iter(validation_enqueuer.get()) + validation_dataset = Dataset.from_generator(valid_generator, (tf.float32, tf.float32), (tf.TensorShape( + [None, self.image_shape[0], self.image_shape[1], self.data_features_len]), tf.TensorShape([None, None])))
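# --- Illustrative aside (not part of the diff): the train, valid, and test pipelines
# above all use the same enqueuer-to-dataset bridge, sketched here end to end with a
# toy keras.utils.Sequence. ToySequence and its shapes are hypothetical stand-ins for
# CostarBlockStackingSequence and the shapes computed from data_features above.
import numpy as np
import tensorflow as tf
from keras.utils import Sequence, OrderedEnqueuer

class ToySequence(Sequence):
    """Yields (features, labels) batches, as CostarBlockStackingSequence does."""
    def __len__(self):
        return 8  # batches per epoch

    def __getitem__(self, idx):
        x = np.random.rand(4, 64, 64, 17).astype(np.float32)  # BHWC feature stack
        y = np.random.rand(4, 8).astype(np.float32)           # xyz + aaxyz_nsc pose
        return x, y

# Worker threads keep a queue of preprocessed batches full in the background...
enqueuer = OrderedEnqueuer(ToySequence(), use_multiprocessing=False, shuffle=True)
enqueuer.start(workers=2, max_queue_size=10)

# ...and tf.data exposes that queue to the graph as ordinary tensors.
dataset = tf.data.Dataset.from_generator(
    lambda: iter(enqueuer.get()),
    (tf.float32, tf.float32),
    (tf.TensorShape([None, 64, 64, 17]), tf.TensorShape([None, None])))
x_batch, y_batch = dataset.make_one_shot_iterator().get_next()
# Within a tf.Session, each sess.run([x_batch, y_batch]) pulls the next prefetched batch.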
+ self.num_valid_examples = len( + validation_data) * self.eval_batch_size * estimated_images_per_example + self.num_valid_batches = ( + self.num_valid_examples + self.eval_batch_size - 1) // self.eval_batch_size + self.x_valid, self.y_valid = validation_dataset.make_one_shot_iterator().get_next() + print("x-v........-------------", self.x_valid.shape) + if "valid_original" not in images.keys(): + # note: unlike the else branch below, self.x_valid here is a symbolic tensor, not an array of batch values + images["valid_original"] = np.copy(self.x_valid) + labels["valid_original"] = np.copy(self.y_valid) + else: + if images["valid"] is not None: + images["valid_original"] = np.copy(images["valid"]) + labels["valid_original"] = np.copy(labels["valid"]) + if self.data_format == "NCHW": + images["valid"] = tf.transpose( + images["valid"], [0, 3, 1, 2]) + self.num_valid_examples = np.shape(images["valid"])[0] + self.num_valid_batches = ( + (self.num_valid_examples + self.eval_batch_size - 1) + // self.eval_batch_size) + self.x_valid, self.y_valid = tf.train.batch( + [images["valid"], labels["valid"]], + batch_size=self.eval_batch_size, + capacity=5000, + enqueue_many=True, + num_threads=1, + allow_smaller_final_batch=True, + ) + + # test data + if self.dataset == "stacking": + # TODO + testing_generator = CostarBlockStackingSequence( + test_data, batch_size=batch_size, verbose=0, + label_features_to_extract=label_features, + data_features_to_extract=data_features, output_shape=self.image_shape, one_hot_encoding=self.one_hot_encoding) + test_enqueuer = OrderedEnqueuer( + testing_generator, + use_multiprocessing=False, + shuffle=True) + test_enqueuer.start(workers=10, max_queue_size=100) + + def test_generator(): return iter(test_enqueuer.get()) + test_dataset = Dataset.from_generator(test_generator, (tf.float32, tf.float32), (tf.TensorShape( + [None, self.image_shape[0], self.image_shape[1], self.data_features_len]), tf.TensorShape([None, None]))) + self.num_test_examples = len( + test_data) * self.eval_batch_size * estimated_images_per_example + self.num_test_batches = ( + self.num_test_examples + self.eval_batch_size - 1) // self.eval_batch_size + self.x_test, self.y_test = test_dataset.make_one_shot_iterator().get_next() + else: + if self.data_format == "NCHW": + images["test"] = tf.transpose(images["test"], [0, 3, 1, 2]) + self.num_test_examples = np.shape(images["test"])[0] + self.num_test_batches = ( + (self.num_test_examples + self.eval_batch_size - 1) + // self.eval_batch_size) + self.x_test, self.y_test = tf.train.batch( + [images["test"], labels["test"]], + batch_size=self.eval_batch_size, + capacity=10000, + enqueue_many=True, + num_threads=1, + allow_smaller_final_batch=True, + ) + + # cache images and labels + self.images = images + self.labels = labels + + def eval_once(self, sess, eval_set, feed_dict=None, verbose=False): + """Expects self.acc and self.global_step to be defined. + + Args: + sess: tf.Session() or one of its wrappers. + feed_dict: can be used to give more information to sess.run().
+ eval_set: "valid" or "test" + """ + + assert self.global_step is not None + global_step = sess.run(self.global_step) + print("Eval at {}".format(global_step)) + + if eval_set == "valid": + assert self.x_valid is not None + assert self.valid_acc is not None + num_examples = self.num_valid_examples + num_batches = self.num_valid_batches + acc_op = self.valid_acc + elif eval_set == "test": + assert self.test_acc is not None + num_examples = self.num_test_examples + num_batches = self.num_test_batches + acc_op = self.test_acc + else: + raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) + + total_acc = 0 + total_exp = 0 + for batch_id in range(num_batches): + acc = sess.run(acc_op, feed_dict=feed_dict) + total_acc += acc + total_exp += self.eval_batch_size + if verbose: + sys.stdout.write( + "\r{:<5d}/{:>5d}".format(total_acc, total_exp)) + if verbose: + print("") + print("{}_accuracy: {:<6.4f}".format( + eval_set, float(total_acc) / total_exp)) + + def _build_train(self): + print("Build train graph") + logits = self._model(self.x_train, True) + log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=self.y_train) + self.loss = tf.reduce_mean(log_probs) + + self.train_preds = tf.argmax(logits, axis=1) + self.train_preds = tf.to_int32(self.train_preds) + self.train_acc = tf.equal(self.train_preds, self.y_train) + self.train_acc = tf.to_int32(self.train_acc) + self.train_acc = tf.reduce_sum(self.train_acc) + + tf_variables = [var + for var in tf.trainable_variables() if var.name.startswith(self.name)] + self.num_vars = count_model_params(tf_variables) + print("-" * 80) + for var in tf_variables: + print(var) + + self.global_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name="global_step") + self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( + self.loss, + tf_variables, + self.global_step, + clip_mode=self.clip_mode, + grad_bound=self.grad_bound, + l2_reg=self.l2_reg, + lr_init=self.lr_init, + lr_dec_start=self.lr_dec_start, + lr_dec_every=self.lr_dec_every, + lr_dec_rate=self.lr_dec_rate, + optim_algo=self.optim_algo, + sync_replicas=self.sync_replicas, + num_aggregate=self.num_aggregate, + num_replicas=self.num_replicas) + + def _build_valid(self): + if self.x_valid is not None: + print("-" * 80) + print("Build valid graph") + logits = self._model(self.x_valid, False, reuse=True) + self.valid_preds = tf.argmax(logits, axis=1) + self.valid_preds = tf.to_int32(self.valid_preds) + self.valid_acc = tf.equal(self.valid_preds, self.y_valid) + self.valid_acc = tf.to_int32(self.valid_acc) + self.valid_acc = tf.reduce_sum(self.valid_acc) + + def _build_test(self): + print("-" * 80) + print("Build test graph") + logits = self._model(self.x_test, False, reuse=True) + self.test_preds = tf.argmax(logits, axis=1) + self.test_preds = tf.to_int32(self.test_preds) + self.test_acc = tf.equal(self.test_preds, self.y_test) + self.test_acc = tf.to_int32(self.test_acc) + self.test_acc = tf.reduce_sum(self.test_acc) + + def build_valid_rl(self, shuffle=False): + print("-" * 80) + print("Build valid graph on shuffled data") + if self.dataset == "stacking": + # TODO + x_valid_shuffle, y_valid_shuffle = self.x_valid, self.y_valid + else: + with tf.device("/cpu:0"): + # shuffled valid data: for choosing validation model + if not shuffle and self.data_format == "NCHW": + self.images["valid_original"] = np.transpose( + self.images["valid_original"], [0, 3, 1, 2]) + x_valid_shuffle, y_valid_shuffle = tf.train.shuffle_batch( + 
[self.images["valid_original"], self.labels["valid_original"]], + batch_size=self.batch_size, + capacity=25000, + enqueue_many=True, + min_after_dequeue=0, + num_threads=16, + seed=self.seed, + allow_smaller_final_batch=True, + ) + + def _pre_process(x): + x = tf.pad(x, [[4, 4], [4, 4], [0, 0]]) + x = tf.random_crop(x, list(x.get_shape()), seed=self.seed) + x = tf.image.random_flip_left_right(x, seed=self.seed) + if self.data_format == "NCHW": + x = tf.transpose(x, [2, 0, 1]) + + return x + + if shuffle: + x_valid_shuffle = tf.map_fn(_pre_process, x_valid_shuffle, + back_prop=False) + + logits = self._model(x_valid_shuffle, False, reuse=True) + valid_shuffle_preds = tf.argmax(logits, axis=1) + valid_shuffle_preds = tf.to_int32(valid_shuffle_preds) + self.valid_shuffle_acc = tf.equal(valid_shuffle_preds, y_valid_shuffle) + self.valid_shuffle_acc = tf.to_int32(self.valid_shuffle_acc) + self.valid_shuffle_acc = tf.reduce_sum(self.valid_shuffle_acc) + + def _model(self, images, is_training, reuse=None): + raise NotImplementedError("Abstract method") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..39cf010 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pillow +matplotlib +scikit-image +pyquaternion +keras diff --git a/scripts/costar_block_stacking_reward_search.sh b/scripts/costar_block_stacking_reward_search.sh new file mode 100755 index 0000000..bd99961 --- /dev/null +++ b/scripts/costar_block_stacking_reward_search.sh @@ -0,0 +1 @@ +python enas/cifar10/main.py --data_format="NHWC" --search_for="micro" --reset_output_dir --output_dir="stacking_outputs_stacking_reward" --batch_size=16 --num_epochs=600 --log_every=50 --eval_every_epochs=1 --child_use_aux_heads --child_num_layers=10 --child_out_filters=32 --child_l2_reg=1e-4 --child_num_branches=5 --child_num_cells=5 --child_keep_prob=0.90 --child_drop_path_keep_prob=0.60 --child_optimizer="sgd" --child_lr_cosine --child_lr_max=1.0 --child_lr_min=0.0005 --child_lr_T_0=10 --child_lr_T_mul=2 --controller_training --controller_search_whole_channels --controller_entropy_weight=0.0001 --controller_train_every=1 --controller_sync_replicas --controller_num_aggregate=10 --controller_train_steps=30 --controller_lr=0.0035 --controller_tanh_constant=1.10 --controller_op_tanh_reduce=2.5 --data_path="~/.keras/datasets/costar_block_stacking_dataset_v0.2/*success.h5f" --dataset="stacking" --height_img 96 --width_img 96 --stacking_reward --max_loss=2 \ No newline at end of file diff --git a/scripts/costar_block_stacking_rotation_final.sh b/scripts/costar_block_stacking_rotation_final.sh new file mode 100755 index 0000000..44b490d --- /dev/null +++ b/scripts/costar_block_stacking_rotation_final.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +export PYTHONPATH="$(pwd)" + + +# Epoch 620: Training controller +# +# ctrl_step=18570 controller_loss=1494.188 ent=31.84 lr=0.0035 |g|=0.0002 acc=0.6875 bl=62.99 mins=5730.29 rw =111.040000916 mse =0.00900602154434 +# angle_error=0.216985523701 +# mae=0.126344487071 +# -------------------------------------------------------------------------------- +# [1 4 1 1 1 1 1 4 3 1 3 0] +# [1 2 1 4 2 2 1 1 2 3 1 1] +# val_acc=1.0000 +# controller_loss=6322.51025391 +# mse=0.00370951113291 +# angle_error=0.104434356093 +# mae=0.126305446029 +# ------------------------- +fixed_arc="1 4 1 1 1 1 1 4 3 1 3 0" +fixed_arc="$fixed_arc 1 4 1 1 1 1 1 4 3 1 3 0" + +python enas/cifar10/main.py \ + --data_format="NHWC" \ + --search_for="micro" \ + --reset_output_dir \ + 
--output_dir="2018_09_14_1249_stacking_outputs_rotation_final_with_root_msle" \ + --batch_size=32 \ + --num_epochs=630 \ + --log_every=50 \ + --eval_every_epochs=10 \ + --child_fixed_arc="${fixed_arc}" \ + --child_use_aux_heads \ + --child_num_layers=10 \ + --child_out_filters=36 \ + --child_num_branches=5 \ + --child_num_cells=3 \ + --child_keep_prob=0.80 \ + --child_drop_path_keep_prob=0.60 \ + --child_l2_reg=2e-4 \ + --child_lr_cosine \ + --child_lr_max=1.0 \ + --child_lr_min=0.0001 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --nocontroller_training \ + --controller_search_whole_channels \ + --controller_entropy_weight=0.0001 \ + --controller_train_every=1 \ + --controller_sync_replicas \ + --controller_num_aggregate=10 \ + --controller_train_steps=50 \ + --controller_lr=0.001 \ + --controller_tanh_constant=1.50 \ + --controller_op_tanh_reduce=2.5 \ + --dataset="stacking" \ + --height_img 64 \ + --width_img 64 \ + --rotation_only \ + --max_loss=2 \ + --use_root \ + --use_msle \ + --one_hot_encoding \ + "$@" + diff --git a/scripts/costar_block_stacking_rotation_no_root_final.sh b/scripts/costar_block_stacking_rotation_no_root_final.sh new file mode 100644 index 0000000..15bcce6 --- /dev/null +++ b/scripts/costar_block_stacking_rotation_no_root_final.sh @@ -0,0 +1,56 @@ +#!/bin/bash +export PYTHONPATH="$(pwd)" +# from 2018_09_15_0953_stacking_outputs_rotation_search_no_root_no_msle.txt +# Epoch 300 +# [1 2 0 1 2 3 1 0 0 3 2 0] │························· +# [0 1 1 1 0 2 2 0 1 3 1 3] +# val_acc=0.9375 │························· +# controller_loss=3453.15795898 │························· +# mse=0.00610682461411 │························· +# angle_error=0.122636593878 │························· +# mae=0.12954197824 +# ------------------------- +fixed_arc="1 2 0 1 2 3 1 0 0 3 2 0" +fixed_arc="$fixed_arc 0 1 1 1 0 2 2 0 1 3 1 3" + +python enas/cifar10/main.py \ + --data_format="NHWC" \ + --search_for="micro" \ + --reset_output_dir \ + --output_dir="2018_09_17_1723_stacking_outputs_rotation_without_root_msle_final" \ + --batch_size=64 \ + --num_epochs=630 \ + --log_every=50 \ + --eval_every_epochs=10 \ + --child_fixed_arc="${fixed_arc}" \ + --child_use_aux_heads \ + --child_num_layers=10 \ + --child_out_filters=36 \ + --child_num_branches=5 \ + --child_num_cells=3 \ + --child_keep_prob=0.80 \ + --child_drop_path_keep_prob=0.60 \ + --child_l2_reg=2e-4 \ + --child_lr_cosine \ + --child_lr_max=1.0 \ + --child_lr_min=0.0001 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --nocontroller_training \ + --controller_search_whole_channels \ + --controller_entropy_weight=0.0001 \ + --controller_train_every=1 \ + --controller_sync_replicas \ + --controller_num_aggregate=10 \ + --controller_train_steps=50 \ + --controller_lr=0.001 \ + --controller_tanh_constant=1.50 \ + --controller_op_tanh_reduce=2.5 \ + --dataset="stacking" \ + --height_img 64 \ + --width_img 64 \ + --rotation_only \ + --max_loss=2 \ + --use_msle \ + --one_hot_encoding \ + "$@" \ No newline at end of file diff --git a/scripts/costar_block_stacking_rotation_search.sh b/scripts/costar_block_stacking_rotation_search.sh new file mode 100755 index 0000000..a4f3dcd --- /dev/null +++ b/scripts/costar_block_stacking_rotation_search.sh @@ -0,0 +1 @@ +python enas/cifar10/main.py --data_format="NHWC" --search_for="micro" --reset_output_dir --output_dir="stacking_outputs_rotation_with_root" --batch_size=16 --num_epochs=640 --log_every=50 --eval_every_epochs=1 --child_use_aux_heads --child_num_layers=8 --child_out_filters=20 --child_filter_size=5 
--child_l2_reg=1e-4 --child_num_branches=5 --child_num_cells=3 --child_keep_prob=0.90 --child_drop_path_keep_prob=0.60 --child_optimizer="sgd" --child_lr_cosine --child_lr_max=1.0 --child_lr_min=0.0005 --child_lr_T_0=10 --child_lr_T_mul=2 --controller_training --controller_search_whole_channels --controller_entropy_weight=0.0001 --controller_train_every=1 --controller_sync_replicas --controller_train_steps=30 --controller_lr=0.0035 --controller_tanh_constant=1.10 --controller_op_tanh_reduce=2.5 --dataset="stacking" --height_img 64 --width_img 64 --rotation_only --max_loss=2 --use_root --one_hot_encoding \ No newline at end of file diff --git a/scripts/costar_block_stacking_rotation_search_no_root.sh b/scripts/costar_block_stacking_rotation_search_no_root.sh new file mode 100755 index 0000000..392a8fb --- /dev/null +++ b/scripts/costar_block_stacking_rotation_search_no_root.sh @@ -0,0 +1 @@ +python enas/cifar10/main.py --data_format="NHWC" --search_for="micro" --reset_output_dir --output_dir="stacking_outputs_rotation_no_root" --batch_size=16 --num_epochs=640 --log_every=50 --eval_every_epochs=1 --child_use_aux_heads --child_num_layers=8 --child_out_filters=20 --child_filter_size=5 --child_l2_reg=1e-4 --child_num_branches=5 --child_num_cells=3 --child_keep_prob=0.90 --child_drop_path_keep_prob=0.60 --child_optimizer="sgd" --child_lr_cosine --child_lr_max=1.0 --child_lr_min=0.0005 --child_lr_T_0=10 --child_lr_T_mul=2 --controller_training --controller_search_whole_channels --controller_entropy_weight=0.0001 --controller_train_every=1 --controller_sync_replicas --controller_train_steps=30 --controller_lr=0.0035 --controller_tanh_constant=1.10 --controller_op_tanh_reduce=2.5 --dataset="stacking" --height_img 64 --width_img 64 --rotation_only --max_loss=2 --one_hot_encoding \ No newline at end of file diff --git a/scripts/costar_block_stacking_search.sh b/scripts/costar_block_stacking_search.sh new file mode 100755 index 0000000..eb60b19 --- /dev/null +++ b/scripts/costar_block_stacking_search.sh @@ -0,0 +1 @@ +python enas/cifar10/main.py --data_format="NHWC" --search_for="micro" --reset_output_dir --output_dir="outputs" --batch_size=16 --num_epochs=600 --log_every=50 --eval_every_epochs=1 --child_use_aux_heads --child_num_layers=6 --child_out_filters=20 --child_l2_reg=1e-4 --child_num_branches=5 --child_num_cells=3 --child_keep_prob=0.90 --child_drop_path_keep_prob=0.60 --child_optimizer="sgd" --child_lr_cosine --child_lr_max=0.5 --child_lr_min=0.0005 --child_lr_T_0=10 --child_lr_T_mul=2 --controller_training --controller_search_whole_channels --controller_entropy_weight=0.00001 --controller_train_every=1 --controller_sync_replicas --controller_num_aggregate=10 --controller_train_steps=30 --controller_lr=0.0035 --controller_tanh_constant=1.10 --controller_op_tanh_reduce=2.5 --data_path="~/.keras/datasets/costar_block_stacking_dataset_v0.2/*success.h5f" --dataset="stacking" --height_img 128 --width_img 128 --max_loss=5 \ No newline at end of file diff --git a/scripts/costar_block_stacking_translation_final.sh b/scripts/costar_block_stacking_translation_final.sh new file mode 100755 index 0000000..20e8d59 --- /dev/null +++ b/scripts/costar_block_stacking_translation_final.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +export PYTHONPATH="$(pwd)" +# from 2018_09_09_2230_micro_translation_search_output_stack.txt +# [0 3 0 2 0 0 1 2 2 4 0 1 0 0 3 0 3 4 2 2] +# [1 4 1 1 2 3 1 0 2 4 0 1 2 1 4 4 2 1 3 2] +# val_acc=0.0625 +# controller_loss=126626.984375 +# mse=0.000223237744649 +# cart_error=0.0890415906906 +# mae=0.0595378726721
+ + fixed_arc="0 3 0 2 0 0 1 2 2 4 0 1 0 0 3 0 3 4 2 2" + fixed_arc="$fixed_arc 1 4 1 1 2 3 1 0 2 4 0 1 2 1 4 4 2 1 3 2" + + # --translation_only: train on the translation component of the block stacking poses + # --use_root: based on the HyperTree "root" in the hypertree code + # --one_hot_encoding: the action will be one hot encoded + python enas/cifar10/main.py \ + --data_format="NHWC" \ + --search_for="micro" \ + --reset_output_dir \ + --output_dir="2018_09_09_2230_stacking_outputs_translation_final_with_root" \ + --batch_size=64 \ + --num_epochs=630 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_fixed_arc="${fixed_arc}" \ + --child_use_aux_heads \ + --child_num_layers=10 \ + --child_out_filters=36 \ + --child_num_branches=5 \ + --child_num_cells=5 \ + --child_keep_prob=0.80 \ + --child_drop_path_keep_prob=0.60 \ + --child_l2_reg=2e-4 \ + --child_lr_cosine \ + --child_lr_max=1.0 \ + --child_lr_min=0.0001 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --nocontroller_training \ + --controller_search_whole_channels \ + --controller_entropy_weight=0.0001 \ + --controller_train_every=1 \ + --controller_sync_replicas \ + --controller_num_aggregate=10 \ + --controller_train_steps=50 \ + --controller_lr=0.001 \ + --controller_tanh_constant=1.50 \ + --controller_op_tanh_reduce=2.5 \ + --dataset="stacking" \ + --height_img 64 \ + --width_img 64 \ + --translation_only \ + --max_loss=2 \ + --use_root \ + --one_hot_encoding \ + "$@" diff --git a/scripts/costar_block_stacking_translation_root_final.sh b/scripts/costar_block_stacking_translation_root_final.sh new file mode 100644 index 0000000..ed82de2 --- /dev/null +++ b/scripts/costar_block_stacking_translation_root_final.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +export PYTHONPATH="$(pwd)" +# from 2018_09_14_1818_stacking_outputs_translation_search_with_root.txt +# [0 1 0 0 0 1 0 4 0 3 0 3] +# [1 4 0 4 2 0 2 4 3 0 1 3] +# val_acc=0.0000 +# controller_loss=111188.476562 +# mse=0.000138682997203 +# cart_error=0.0660963505507 +# mae=0.0264506191015 + +fixed_arc="0 1 0 0 0 1 0 4 0 3 0 3" +fixed_arc="$fixed_arc 1 4 0 4 2 0 2 4 3 0 1 3" + +# --translation_only: train on the translation component of the block stacking poses +# --use_root: based on the HyperTree "root" in the hypertree code +# --one_hot_encoding: the action will be one hot encoded +python enas/cifar10/main.py \ + --data_format="NHWC" \ + --search_for="micro" \ + --reset_output_dir \ + --output_dir="2018_09_17_1725_stacking_outputs_translation_with_root_final" \ + --batch_size=64 \ + --num_epochs=630 \ + --log_every=50 \ + --eval_every_epochs=10 \ + --child_fixed_arc="${fixed_arc}" \ + --child_use_aux_heads \ + --child_num_layers=10 \ + --child_out_filters=36 \ + --child_num_branches=5 \ + --child_num_cells=3 \ + --child_keep_prob=0.80 \ + --child_drop_path_keep_prob=0.60 \ + --child_l2_reg=2e-4 \ + --child_lr_cosine \ + --child_lr_max=1.0 \ + --child_lr_min=0.0001 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --nocontroller_training \ + --controller_search_whole_channels \ + --controller_entropy_weight=0.0001 \ + --controller_train_every=1 \ + --controller_sync_replicas \ + --controller_num_aggregate=10 \ + --controller_train_steps=50 \ + --controller_lr=0.001 \ + --controller_tanh_constant=1.50 \ + --controller_op_tanh_reduce=2.5 \ + --dataset="stacking" \ + --height_img 64 \ + --width_img 64 \ + --translation_only \ + --max_loss=2 \ + --use_root \ + --one_hot_encoding \ + "$@"
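# --- Illustrative aside (not part of the diff): how the fixed_arc strings in these
# scripts are laid out, assuming the ENAS micro search space convention of 4 integers
# per cell node (an input index and an op id for each of the node's two inputs, where
# indices 0 and 1 denote the outputs of the two previous cells). The first string is
# the normal cell and the second the reduction cell, so --child_num_cells=5 requires
# 20 integers each. decode_cell is a hypothetical helper for inspection only.
normal_arc = [int(x) for x in "0 3 0 2 0 0 1 2 2 4 0 1 0 0 3 0 3 4 2 2".split()]

def decode_cell(arc, num_cells=5):
    """Group a flat arc into one (x_id, x_op, y_id, y_op) tuple per node."""
    assert len(arc) == 4 * num_cells
    return [tuple(arc[4 * i:4 * i + 4]) for i in range(num_cells)]

for node, (x_id, x_op, y_id, y_op) in enumerate(decode_cell(normal_arc), start=2):
    print("node %d: inputs %d and %d, ops %d and %d" % (node, x_id, y_id, x_op, y_op))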
diff --git a/scripts/costar_block_stacking_translation_search.sh b/scripts/costar_block_stacking_translation_search.sh new file mode 100755 index 0000000..f3668b2 --- /dev/null +++ b/scripts/costar_block_stacking_translation_search.sh @@ -0,0 +1 @@ +python enas/cifar10/main.py --data_format="NHWC" --search_for="micro" --reset_output_dir --output_dir="stacking_outputs_translation_5_cells_with_root" --batch_size=16 --num_epochs=640 --log_every=50 --eval_every_epochs=1 --child_use_aux_heads --child_num_layers=10 --child_out_filters=20 --child_filter_size=5 --child_l2_reg=1e-4 --child_num_branches=5 --child_num_cells=5 --child_keep_prob=0.90 --child_drop_path_keep_prob=0.60 --child_optimizer="sgd" --child_lr_cosine --child_lr_max=1.0 --child_lr_min=0.0005 --child_lr_T_0=10 --child_lr_T_mul=2 --controller_training --controller_search_whole_channels --controller_entropy_weight=0.0001 --controller_train_every=1 --controller_sync_replicas --controller_num_aggregate=10 --controller_train_steps=30 --controller_lr=0.0035 --controller_tanh_constant=1.10 --controller_op_tanh_reduce=2.5 --dataset="stacking" --height_img 64 --width_img 64 --translation_only --max_loss=2 --use_root --one_hot_encoding \ No newline at end of file diff --git a/scripts/costar_block_stacking_translation_search_no_root.sh b/scripts/costar_block_stacking_translation_search_no_root.sh new file mode 100755 index 0000000..22b6765 --- /dev/null +++ b/scripts/costar_block_stacking_translation_search_no_root.sh @@ -0,0 +1 @@ +python enas/cifar10/main.py --data_format="NHWC" --search_for="micro" --reset_output_dir --output_dir="stacking_outputs_translation_no_root" --batch_size=16 --num_epochs=640 --log_every=50 --eval_every_epochs=1 --child_use_aux_heads --child_num_layers=8 --child_out_filters=20 --child_filter_size=5 --child_l2_reg=1e-4 --child_num_branches=5 --child_num_cells=3 --child_keep_prob=0.90 --child_drop_path_keep_prob=0.60 --child_optimizer="sgd" --child_lr_cosine --child_lr_max=1.0 --child_lr_min=0.0005 --child_lr_T_0=10 --child_lr_T_mul=2 --controller_training --controller_search_whole_channels --controller_entropy_weight=0.0001 --controller_train_every=1 --controller_sync_replicas --controller_train_steps=30 --controller_lr=0.0035 --controller_tanh_constant=1.10 --controller_op_tanh_reduce=2.5 --dataset="stacking" --height_img 64 --width_img 64 --translation_only --max_loss=2 --one_hot_encoding \ No newline at end of file
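# --- Illustrative aside (not part of the diff): the controller reward shaping that
# these stacking runs optimize, recapped as a standalone TF1 sketch. The epsilon guard
# keeps the reward finite as the child's validation loss approaches zero; the earlier
# linear form (max_loss - loss, with max_loss set by the scripts' --max_loss flag) is
# kept as a comment. The bl_dec value below is an assumption, not taken from the diff.
import tensorflow as tf

valid_shuffle_loss = tf.placeholder(tf.float32, shape=[])  # child validation loss
bl_dec = 0.99      # assumed baseline decay rate
max_loss = 2.0     # as passed by the scripts above

epsilon = 1e-12
reward = 1.0 / tf.maximum(tf.abs(valid_shuffle_loss), epsilon)
# reward = max_loss - valid_shuffle_loss  # previous reward which sort of worked

# Exponential moving average baseline: baseline <- baseline - (1 - bl_dec) * (baseline - reward)
baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
baseline_update = tf.assign_sub(baseline, (1 - bl_dec) * (baseline - reward))
with tf.control_dependencies([baseline_update]):
    reward = tf.identity(reward)
# REINFORCE then scales the sampled architecture's log-probability by the advantage:
# loss = sample_log_prob * (reward - baseline)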